
[Xen-devel] [PATCH RFC 24/44] x86/mapcache: Reimplement map_domain_page() from scratch



There are two reasons for this change:
 1) To stop using the per-domain range for the mapcache
 2) To make map_domain_page() safe to use during context switches

The new implementation is entirely percpu and rather simpler.  See the
comment at the top of domain_page.c for a description of the algorithm.

A side effect of the new implementation is that we can get rid of struct
mapcache_{vcpu,domain} entirely, and mapcache_override_current().

Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>

TODO: Consider whether to try to unmap lazily, utilising other TLB flush
scenarios rather than forcing an invlpg on each unmap.
---
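
For illustration only (not part of the patch): a minimal userspace sketch of the
percpu slot allocation and index-to-linear-address scheme described in the new
comment at the top of domain_page.c.  MAPCACHE_START, NR_SLOTS, idx_to_linear()
and friends are stand-ins for PERCPU_MAPCACHE_START, NR_MAPCACHE_SLOTS and the
static helpers added below; the real code tracks the bitmap per CPU and updates
the l1 pagetable entries with interrupts disabled.

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define MAPCACHE_START  0xffff820000000000UL /* stand-in base address */
#define NR_SLOTS        16 /* CONFIG_PAGING_LEVELS^2: enough for a nested pagewalk */

static unsigned long inuse; /* one bit per slot, as in struct mapcache_info */

/* Slot i lives at START + (2*i + 1) pages, so every mapping is surrounded
 * by unmapped guard pages which catch out-of-bounds accesses. */
static unsigned long idx_to_linear(unsigned int idx)
{
    return MAPCACHE_START + ((unsigned long)(idx * 2 + 1) << PAGE_SHIFT);
}

static unsigned int linear_to_idx(unsigned long linear)
{
    return ((linear - MAPCACHE_START) >> PAGE_SHIFT) / 2;
}

/* Claim the first free slot and return its linear address. */
static unsigned long slot_alloc(void)
{
    unsigned int i;

    for ( i = 0; i < NR_SLOTS; i++ )
        if ( !(inuse & (1UL << i)) )
        {
            inuse |= 1UL << i;
            return idx_to_linear(i);
        }

    assert(!"mapcache exhausted"); /* mirrors the BUG_ON() in map_domain_page() */
    return 0;
}

static void slot_free(unsigned long linear)
{
    unsigned int idx = linear_to_idx(linear);

    assert(idx < NR_SLOTS && (inuse & (1UL << idx)));
    inuse &= ~(1UL << idx);
}

int main(void)
{
    unsigned long a = slot_alloc(), b = slot_alloc();

    /* Adjacent slots are two pages apart, leaving a guard page between them. */
    printf("a=%#lx b=%#lx gap=%lu pages\n", a, b, (b - a) >> PAGE_SHIFT);

    slot_free(b);
    slot_free(a);

    return 0;
}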
 xen/arch/x86/domain.c        |   6 -
 xen/arch/x86/domain_page.c   | 353 +++++++++++++------------------------------
 xen/arch/x86/pv/dom0_build.c |   3 -
 xen/include/asm-x86/config.h |   7 -
 xen/include/asm-x86/domain.h |  42 -----
 5 files changed, 106 insertions(+), 305 deletions(-)
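
As a usage illustration: callers map, use, and unmap on the same CPU, typically
in a tight pattern like the hypothetical helper below (read_guest_word() is not
part of this patch).  Because the local mapcache is percpu, the returned pointer
must not be handed to another CPU or cached across a context switch.

/* Hypothetical caller, for illustration only. */
static uint32_t read_guest_word(mfn_t mfn, unsigned int offset)
{
    /* Claims a percpu slot; usable from fault handlers, per the comment in
     * map_domain_page().  offset is in 32-bit words within the page. */
    uint32_t *map = map_domain_page(mfn);
    uint32_t val = map[offset];

    unmap_domain_page(map); /* zaps the PTE, invlpg, frees the slot */

    return val;
}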

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 93e81c0..3d9e7fb 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -324,10 +324,6 @@ int vcpu_initialise(struct vcpu *v)
 
     v->arch.flags = TF_kernel_mode;
 
-    rc = mapcache_vcpu_init(v);
-    if ( rc )
-        return rc;
-
     if ( !is_idle_domain(d) )
     {
         paging_vcpu_init(v);
@@ -478,8 +474,6 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags,
         d->arch.emulation_flags = emflags;
     }
 
-    mapcache_domain_init(d);
-
     HYPERVISOR_COMPAT_VIRT_START(d) =
         is_pv_domain(d) ? __HYPERVISOR_COMPAT_VIRT_START : ~0u;
 
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 8f2bcd4..c17ff66 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -18,291 +18,131 @@
 #include <asm/hardirq.h>
 #include <asm/setup.h>
 
-static DEFINE_PER_CPU(struct vcpu *, override);
+/*
+ * Global mapcache entries are implemented using the vmap() infrastructure.
+ *
+ * Local mapcache entries are implemented with a percpu linear range, starting
+ * at PERCPU_MAPCACHE_START.  The maximum number of concurrent mappings we
+ * expect to use (NR_MAPCACHE_SLOTS) is for a nested pagewalk.  Being a small
+ * number, allocations are tracked with a simple bitmap (inuse).
+ *
+ * There is plenty of linear address space to use, so addresses are handed out
+ * by index into the inuse bitmap, with unmapped guard pages in between, to
+ * help catch bounds errors in the code using the mappings.
+ *
+ * It is *not* safe to pass local mapcache mappings to other CPUs to use.
+ */
 
-static inline struct vcpu *mapcache_current_vcpu(void)
-{
-    /* In the common case we use the mapcache of the running VCPU. */
-    struct vcpu *v = this_cpu(override) ?: current;
+struct mapcache_info {
+#define NR_MAPCACHE_SLOTS (CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS)
+    unsigned long inuse;
+};
+static DEFINE_PER_CPU(struct mapcache_info, mapcache_info);
 
-    /*
-     * When current isn't properly set up yet, this is equivalent to
-     * running in an idle vCPU (callers must check for NULL).
-     */
-    if ( v == INVALID_VCPU )
-        return NULL;
+static unsigned long mapcache_idx_to_linear(unsigned int idx)
+{
+    return PERCPU_MAPCACHE_START + pfn_to_paddr(idx * 2 + 1);
+}
 
-    /*
-     * When using efi runtime page tables, we have the equivalent of the idle
-     * domain's page tables but current may point at another domain's VCPU.
-     * Return NULL as though current is not properly set up yet.
-     */
-    if ( efi_rs_using_pgtables() )
-        return NULL;
+static unsigned int mapcache_linear_to_idx(unsigned long linear)
+{
+    return paddr_to_pfn(linear - PERCPU_MAPCACHE_START) / 2;
+}
 
-    /*
-     * If guest_table is NULL, and we are running a paravirtualised guest,
-     * then it means we are running on the idle domain's page table and must
-     * therefore use its mapcache.
-     */
-    if ( unlikely(pagetable_is_null(v->arch.guest_table)) && is_pv_vcpu(v) )
-    {
-        /* If we really are idling, perform lazy context switch now. */
-        if ( (v = idle_vcpu[smp_processor_id()]) == current )
-            sync_local_execstate();
-        /* We must now be running on the idle page table. */
-        ASSERT(read_cr3() == this_cpu(percpu_idle_pt));
-    }
+static l1_pgentry_t *mapcache_l1e(unsigned long linear)
+{
+    l1_pgentry_t *l1t = (l1_pgentry_t *)PERCPU_MAPCACHE_L1ES;
 
-    return v;
+    return &l1t[l1_table_offset(linear)];
 }
 
-void __init mapcache_override_current(struct vcpu *v)
+/*
+ * Look up a mapcache entry, based on a linear address, ASSERT()ing that it is
+ * bounded sensibly and in use.
+ */
+static l1_pgentry_t *lookup_inuse_mapcache_entry(
+    unsigned long linear, unsigned int *p_idx)
 {
-    this_cpu(override) = v;
-}
+    unsigned int idx;
+    l1_pgentry_t *pl1e;
 
-#define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
-#define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1)
-#define MAPCACHE_L1ENT(idx) \
-    __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))]
+    ASSERT(linear >= PERCPU_MAPCACHE_START && linear < PERCPU_MAPCACHE_END);
+
+    idx = mapcache_linear_to_idx(linear);
+    ASSERT(idx < NR_MAPCACHE_SLOTS);
+    ASSERT(test_bit(idx, &this_cpu(mapcache_info).inuse));
+
+    if ( p_idx )
+        *p_idx = idx;
+
+    pl1e = mapcache_l1e(linear);
+    ASSERT(l1e_get_flags(*pl1e) & _PAGE_PRESENT);
+
+    return pl1e;
+}
 
 void *map_domain_page(mfn_t mfn)
 {
-    unsigned long flags;
-    unsigned int idx, i;
-    struct vcpu *v;
-    struct mapcache_domain *dcache;
-    struct mapcache_vcpu *vcache;
-    struct vcpu_maphash_entry *hashent;
+    unsigned long flags, linear;
+    unsigned int idx;
+    struct mapcache_info *mci = &this_cpu(mapcache_info);
+    l1_pgentry_t *pl1e;
 
 #ifdef NDEBUG
     if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
         return mfn_to_virt(mfn_x(mfn));
 #endif
 
-    v = mapcache_current_vcpu();
-    if ( !v || !is_pv_vcpu(v) )
+    if ( this_cpu(curr_extended_directmap) )
         return mfn_to_virt(mfn_x(mfn));
 
-    dcache = &v->domain->arch.pv_domain.mapcache;
-    vcache = &v->arch.pv_vcpu.mapcache;
-    if ( !dcache->inuse )
-        return mfn_to_virt(mfn_x(mfn));
-
-    perfc_incr(map_domain_page_count);
-
+    /*
+     * map_domain_page() is used from many contexts, including fault handlers.
+     * Disable interrupts to keep the inuse bitmap consistent with the l1t.
+     *
+     * Be aware! Any #PF inside this region will most likely recurse with the
+     * spurious pagefault handler until the BUG_ON() is hit.
+     */
     local_irq_save(flags);
 
-    hashent = &vcache->hash[MAPHASH_HASHFN(mfn_x(mfn))];
-    if ( hashent->mfn == mfn_x(mfn) )
-    {
-        idx = hashent->idx;
-        ASSERT(idx < dcache->entries);
-        hashent->refcnt++;
-        ASSERT(hashent->refcnt);
-        ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(idx)) == mfn_x(mfn));
-        goto out;
-    }
-
-    spin_lock(&dcache->lock);
-
-    /* Has some other CPU caused a wrap? We must flush if so. */
-    if ( unlikely(dcache->epoch != vcache->shadow_epoch) )
-    {
-        vcache->shadow_epoch = dcache->epoch;
-        if ( NEED_FLUSH(this_cpu(tlbflush_time), dcache->tlbflush_timestamp) )
-        {
-            perfc_incr(domain_page_tlb_flush);
-            flush_tlb_local();
-        }
-    }
+    idx = find_first_zero_bit(&mci->inuse, NR_MAPCACHE_SLOTS);
+    BUG_ON(idx == NR_MAPCACHE_SLOTS);
 
-    idx = find_next_zero_bit(dcache->inuse, dcache->entries, dcache->cursor);
-    if ( unlikely(idx >= dcache->entries) )
-    {
-        unsigned long accum = 0, prev = 0;
-
-        /* /First/, clean the garbage map and update the inuse list. */
-        for ( i = 0; i < BITS_TO_LONGS(dcache->entries); i++ )
-        {
-            accum |= prev;
-            dcache->inuse[i] &= ~xchg(&dcache->garbage[i], 0);
-            prev = ~dcache->inuse[i];
-        }
-
-        if ( accum | (prev & BITMAP_LAST_WORD_MASK(dcache->entries)) )
-            idx = find_first_zero_bit(dcache->inuse, dcache->entries);
-        else
-        {
-            /* Replace a hash entry instead. */
-            i = MAPHASH_HASHFN(mfn_x(mfn));
-            do {
-                hashent = &vcache->hash[i];
-                if ( hashent->idx != MAPHASHENT_NOTINUSE && !hashent->refcnt )
-                {
-                    idx = hashent->idx;
-                    ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(idx)) == hashent->mfn);
-                    l1e_write(&MAPCACHE_L1ENT(idx), l1e_empty());
-                    hashent->idx = MAPHASHENT_NOTINUSE;
-                    hashent->mfn = ~0UL;
-                    break;
-                }
-                if ( ++i == MAPHASH_ENTRIES )
-                    i = 0;
-            } while ( i != MAPHASH_HASHFN(mfn_x(mfn)) );
-        }
-        BUG_ON(idx >= dcache->entries);
-
-        /* /Second/, flush TLBs. */
-        perfc_incr(domain_page_tlb_flush);
-        flush_tlb_local();
-        vcache->shadow_epoch = ++dcache->epoch;
-        dcache->tlbflush_timestamp = tlbflush_current_time();
-    }
+    __set_bit(idx, &mci->inuse);
 
-    set_bit(idx, dcache->inuse);
-    dcache->cursor = idx + 1;
+    linear = mapcache_idx_to_linear(idx);
+    pl1e = mapcache_l1e(linear);
 
-    spin_unlock(&dcache->lock);
+    ASSERT(!(l1e_get_flags(*pl1e) & _PAGE_PRESENT));
+    *pl1e = l1e_from_mfn(mfn, __PAGE_HYPERVISOR_RW);
+    barrier(); /* Ensure the pagetable is updated before enabling interrupts. */
 
-    l1e_write(&MAPCACHE_L1ENT(idx), l1e_from_mfn(mfn, __PAGE_HYPERVISOR_RW));
-
- out:
     local_irq_restore(flags);
-    return (void *)MAPCACHE_VIRT_START + pfn_to_paddr(idx);
+
+    return (void *)linear;
 }
 
 void unmap_domain_page(const void *ptr)
 {
+    struct mapcache_info *mci = &this_cpu(mapcache_info);
+    unsigned long flags, linear = (unsigned long)ptr;
     unsigned int idx;
-    struct vcpu *v;
-    struct mapcache_domain *dcache;
-    unsigned long va = (unsigned long)ptr, mfn, flags;
-    struct vcpu_maphash_entry *hashent;
+    l1_pgentry_t *pl1e;
 
-    if ( va >= DIRECTMAP_VIRT_START )
+    if ( linear >= DIRECTMAP_VIRT_START )
         return;
 
-    ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
-
-    v = mapcache_current_vcpu();
-    ASSERT(v && is_pv_vcpu(v));
-
-    dcache = &v->domain->arch.pv_domain.mapcache;
-    ASSERT(dcache->inuse);
-
-    idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
-    mfn = l1e_get_pfn(MAPCACHE_L1ENT(idx));
-    hashent = &v->arch.pv_vcpu.mapcache.hash[MAPHASH_HASHFN(mfn)];
+    pl1e = lookup_inuse_mapcache_entry(linear, &idx);
 
     local_irq_save(flags);
 
-    if ( hashent->idx == idx )
-    {
-        ASSERT(hashent->mfn == mfn);
-        ASSERT(hashent->refcnt);
-        hashent->refcnt--;
-    }
-    else if ( !hashent->refcnt )
-    {
-        if ( hashent->idx != MAPHASHENT_NOTINUSE )
-        {
-            /* /First/, zap the PTE. */
-            ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(hashent->idx)) ==
-                   hashent->mfn);
-            l1e_write(&MAPCACHE_L1ENT(hashent->idx), l1e_empty());
-            /* /Second/, mark as garbage. */
-            set_bit(hashent->idx, dcache->garbage);
-        }
-
-        /* Add newly-freed mapping to the maphash. */
-        hashent->mfn = mfn;
-        hashent->idx = idx;
-    }
-    else
-    {
-        /* /First/, zap the PTE. */
-        l1e_write(&MAPCACHE_L1ENT(idx), l1e_empty());
-        /* /Second/, mark as garbage. */
-        set_bit(idx, dcache->garbage);
-    }
+    *pl1e = l1e_empty();
+    asm volatile ( "invlpg %0" :: "m" (*(char *)ptr) : "memory" );
+    __clear_bit(idx, &mci->inuse);
 
     local_irq_restore(flags);
 }
 
-int mapcache_domain_init(struct domain *d)
-{
-    struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache;
-    unsigned int bitmap_pages;
-
-    if ( !is_pv_domain(d) || is_idle_domain(d) )
-        return 0;
-
-#ifdef NDEBUG
-    if ( !mem_hotplug && max_page <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
-        return 0;
-#endif
-
-    BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
-                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
-                 MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
-    bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
-    dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
-    dcache->garbage = dcache->inuse +
-                      (bitmap_pages + 1) * PAGE_SIZE / sizeof(long);
-
-    spin_lock_init(&dcache->lock);
-
-    return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
-                                    2 * bitmap_pages + 1,
-                                    NIL(l1_pgentry_t *), NULL);
-}
-
-int mapcache_vcpu_init(struct vcpu *v)
-{
-    struct domain *d = v->domain;
-    struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache;
-    unsigned long i;
-    unsigned int ents = d->max_vcpus * MAPCACHE_VCPU_ENTRIES;
-    unsigned int nr = PFN_UP(BITS_TO_LONGS(ents) * sizeof(long));
-
-    if ( !is_pv_vcpu(v) || !dcache->inuse )
-        return 0;
-
-    if ( ents > dcache->entries )
-    {
-        /* Populate page tables. */
-        int rc = create_perdomain_mapping(d, MAPCACHE_VIRT_START, ents,
-                                          NIL(l1_pgentry_t *), NULL);
-
-        /* Populate bit maps. */
-        if ( !rc )
-            rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
-                                          nr, NULL, NIL(struct page_info *));
-        if ( !rc )
-            rc = create_perdomain_mapping(d, (unsigned long)dcache->garbage,
-                                          nr, NULL, NIL(struct page_info *));
-
-        if ( rc )
-            return rc;
-
-        dcache->entries = ents;
-    }
-
-    /* Mark all maphash entries as not in use. */
-    BUILD_BUG_ON(MAPHASHENT_NOTINUSE < MAPCACHE_ENTRIES);
-    for ( i = 0; i < MAPHASH_ENTRIES; i++ )
-    {
-        struct vcpu_maphash_entry *hashent = &v->arch.pv_vcpu.mapcache.hash[i];
-
-        hashent->mfn = ~0UL; /* never valid to map */
-        hashent->idx = MAPHASHENT_NOTINUSE;
-    }
-
-    return 0;
-}
-
 void *map_domain_page_global(mfn_t mfn)
 {
     ASSERT(!in_irq() &&
@@ -345,10 +185,29 @@ unsigned long domain_page_map_to_mfn(const void *ptr)
         BUG_ON(!pl1e);
     }
     else
-    {
-        ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
-        pl1e = &__linear_l1_table[l1_linear_offset(va)];
-    }
+        pl1e = lookup_inuse_mapcache_entry(va, NULL);
 
     return l1e_get_pfn(*pl1e);
 }
+
+static __init __maybe_unused void build_assertions(void)
+{
+    struct mapcache_info info;
+
+    /* NR_MAPCACHE_SLOTS within the bounds of the inuse bitmap? */
+    BUILD_BUG_ON(NR_MAPCACHE_SLOTS > (sizeof(info.inuse) * 8));
+
+    /* Enough linear address space, including guard pages? */
+    BUILD_BUG_ON(pfn_to_paddr(NR_MAPCACHE_SLOTS * 2) >
+                 (PERCPU_MAPCACHE_END - PERCPU_MAPCACHE_START));
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c
index 09c765a..3baf37b 100644
--- a/xen/arch/x86/pv/dom0_build.c
+++ b/xen/arch/x86/pv/dom0_build.c
@@ -698,7 +698,6 @@ int __init dom0_construct_pv(struct domain *d,
 
     /* We run on dom0's page tables for the final part of the build process. */
     write_ptbase(v);
-    mapcache_override_current(v);
 
     /* Copy the OS image and free temporary buffer. */
     elf.dest_base = (void*)vkern_start;
@@ -717,7 +716,6 @@ int __init dom0_construct_pv(struct domain *d,
         if ( (parms.virt_hypercall < v_start) ||
              (parms.virt_hypercall >= v_end) )
         {
-            mapcache_override_current(NULL);
             write_ptbase(current);
             printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
             rc = -1;
@@ -838,7 +836,6 @@ int __init dom0_construct_pv(struct domain *d,
         xlat_start_info(si, XLAT_start_info_console_dom0);
 
     /* Return to idle domain's page tables. */
-    mapcache_override_current(NULL);
     write_ptbase(current);
 
     update_domain_wallclock_time(d);
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index a95f8c8..f78cbde 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -314,13 +314,6 @@ extern unsigned long xen_phys_start;
 #define LDT_VIRT_START(v)    \
     (GDT_VIRT_START(v) + (64*1024))
 
-/* map_domain_page() map cache. The second per-domain-mapping sub-area. */
-#define MAPCACHE_VCPU_ENTRIES    (CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS)
-#define MAPCACHE_ENTRIES         (MAX_VIRT_CPUS * MAPCACHE_VCPU_ENTRIES)
-#define MAPCACHE_VIRT_START      PERDOMAIN_VIRT_SLOT(1)
-#define MAPCACHE_VIRT_END        (MAPCACHE_VIRT_START + \
-                                  MAPCACHE_ENTRIES * PAGE_SIZE)
-
 /* Argument translation area. The third per-domain-mapping sub-area. */
 #define ARG_XLAT_VIRT_START      PERDOMAIN_VIRT_SLOT(2)
 /* Allow for at least one guard page (COMPAT_ARG_XLAT_SIZE being 2 pages): */
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index f699119..fa57c93 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -38,42 +38,6 @@ struct trap_bounce {
     unsigned long eip;
 };
 
-#define MAPHASH_ENTRIES 8
-#define MAPHASH_HASHFN(pfn) ((pfn) & (MAPHASH_ENTRIES-1))
-#define MAPHASHENT_NOTINUSE ((u32)~0U)
-struct mapcache_vcpu {
-    /* Shadow of mapcache_domain.epoch. */
-    unsigned int shadow_epoch;
-
-    /* Lock-free per-VCPU hash of recently-used mappings. */
-    struct vcpu_maphash_entry {
-        unsigned long mfn;
-        uint32_t      idx;
-        uint32_t      refcnt;
-    } hash[MAPHASH_ENTRIES];
-};
-
-struct mapcache_domain {
-    /* The number of array entries, and a cursor into the array. */
-    unsigned int entries;
-    unsigned int cursor;
-
-    /* Protects map_domain_page(). */
-    spinlock_t lock;
-
-    /* Garbage mappings are flushed from TLBs in batches called 'epochs'. */
-    unsigned int epoch;
-    u32 tlbflush_timestamp;
-
-    /* Which mappings are in use, and which are garbage to reap next epoch? */
-    unsigned long *inuse;
-    unsigned long *garbage;
-};
-
-int mapcache_domain_init(struct domain *);
-int mapcache_vcpu_init(struct vcpu *);
-void mapcache_override_current(struct vcpu *);
-
 /* x86/64: toggle guest between kernel and user modes. */
 void toggle_guest_mode(struct vcpu *);
 /* x86/64: toggle guest page tables between kernel and user modes. */
@@ -251,9 +215,6 @@ struct pv_domain
 
     atomic_t nr_l4_pages;
 
-    /* map_domain_page() mapping cache. */
-    struct mapcache_domain mapcache;
-
     struct cpuidmasks *cpuidmasks;
 };
 
@@ -444,9 +405,6 @@ struct arch_domain
 
 struct pv_vcpu
 {
-    /* map_domain_page() mapping cache. */
-    struct mapcache_vcpu mapcache;
-
     struct trap_info *trap_ctxt;
 
     unsigned long gdt_frames[FIRST_RESERVED_GDT_PAGE];
-- 
2.1.4

