[Xen-devel] [PATCH RFC 26/44] x86/pt-shadow: Maintain a small cache of shadowed frames
This improves the shadowing performance substantially. In particular, system
calls for 64bit PV guests (which switch between the user and kernel
pagetables) no longer suffer a 4K copy hit in both directions.
See the code comments for reasoning and the algorithm description.
Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
xen/arch/x86/mm.c | 2 +
xen/arch/x86/mm/shadow/multi.c | 2 +
xen/arch/x86/pv/pt-shadow.c | 196 ++++++++++++++++++++++++++++++++-----
xen/include/asm-x86/pv/pt-shadow.h | 9 ++
4 files changed, 186 insertions(+), 23 deletions(-)
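
To make the cache behaviour easier to review in isolation, below is a minimal
user-space sketch of the most-recently-used scheme this patch implements. The
names (cache_entry_t, shadow_for, copy_l4) and the example cr3 values are
illustrative stand-ins, not the interfaces added by the patch itself.

/*
 * Purely illustrative model of the 4-entry MRU cache: page-aligned cr3
 * packed with the shadow frame index in the low bits, promote on hit,
 * recycle the least recently used slot (and redo the 4K copy) on miss.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT      12
#define L4_SHADOW_ORDER 2
#define NR_L4_SHADOWS   (1u << L4_SHADOW_ORDER)

typedef union cache_entry {
    unsigned long raw;
    struct {
        unsigned long idx:PAGE_SHIFT;            /* which percpu shadow frame */
        unsigned long cr3_mfn:(64 - PAGE_SHIFT); /* guest L4 being shadowed */
    };
} cache_entry_t;

static cache_entry_t cache[NR_L4_SHADOWS]; /* cache[0] is most recently used */

/* Stand-in for the 4K copy into percpu shadow frame 'idx'. */
static void copy_l4(unsigned int idx, unsigned long cr3)
{
    printf("4K copy: cr3 %#lx -> shadow frame %u\n", cr3, idx);
}

/* Return the shadow frame index to run on for 'cr3', maintaining MRU order. */
static unsigned int shadow_for(unsigned long cr3)
{
    unsigned int i;
    cache_entry_t tmp;

    for ( i = 0; i < NR_L4_SHADOWS; ++i )
        if ( cache[i].cr3_mfn == cr3 >> PAGE_SHIFT )
        {
            /* Hit: rotate this entry to the front; no copy needed. */
            tmp = cache[i];
            memmove(&cache[1], &cache[0], i * sizeof(cache[0]));
            cache[0] = tmp;

            return cache[0].idx;
        }

    /* Miss: recycle the least recently used slot and redo the 4K copy. */
    tmp = cache[NR_L4_SHADOWS - 1];
    memmove(&cache[1], &cache[0], (NR_L4_SHADOWS - 1) * sizeof(cache[0]));
    tmp.cr3_mfn = cr3 >> PAGE_SHIFT;
    cache[0] = tmp;
    copy_l4(cache[0].idx, cr3);

    return cache[0].idx;
}

int main(void)
{
    unsigned int i;

    for ( i = 0; i < NR_L4_SHADOWS; ++i )
        cache[i] = (cache_entry_t){ .idx = i };

    /*
     * A 64bit PV vcpu bouncing between kernel and user cr3 on syscalls:
     * only the first switch to each pagetable pays for a copy.
     */
    shadow_for(0x1000);   /* miss - copies */
    shadow_for(0x2000);   /* miss - copies */
    shadow_for(0x1000);   /* hit  - no copy */
    shadow_for(0x2000);   /* hit  - no copy */

    return 0;
}

With both of a vcpu's pagetables resident in the cache, the syscall path no
longer pays the 4K copy; copies only happen when a pagetable is first seen or
has been evicted.
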
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index d5c69c0..f8f15e9 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2413,6 +2413,8 @@ int free_page_type(struct page_info *page, unsigned long type,
case PGT_l4_page_table:
ASSERT(preemptible);
rc = free_l4_table(page);
+ if ( !rc )
+ pt_shadow_l4_invlpg(owner, page);
break;
default:
gdprintk(XENLOG_WARNING, "type %" PRtype_info " mfn %" PRI_mfn "\n",
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 9c929ed..f9ec5aa 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -1895,6 +1895,8 @@ void sh_destroy_l4_shadow(struct domain *d, mfn_t smfn)
}
});
+ pt_shadow_l4_invlpg(d, sp);
+
/* Put the memory back in the pool */
shadow_free(d, smfn);
}
diff --git a/xen/arch/x86/pv/pt-shadow.c b/xen/arch/x86/pv/pt-shadow.c
index 33cb303..b4f2b86 100644
--- a/xen/arch/x86/pv/pt-shadow.c
+++ b/xen/arch/x86/pv/pt-shadow.c
@@ -24,6 +24,10 @@
#include <asm/pv/pt-shadow.h>
+/* Override macros from asm/mm.h to make them work with mfn_t */
+#undef page_to_mfn
+#define page_to_mfn(pg) _mfn(__page_to_mfn(pg))
+
/*
* To use percpu linear ranges, we require that no two pcpus have %cr3
* pointing at the same L4 pagetable at the same time.
@@ -38,19 +42,44 @@
*
* The algorithm is fairly simple.
*
+ * - A small cache of shadowed L4s from the same guest is maintained.
* - When a pcpu is switching to a new vcpu cr3 and shadowing is necessary,
- * perform a full 4K copy of the guests frame into a percpu frame, and run
- * on that.
+ * the cache is searched.
+ * - If the new cr3 is already cached, use our existing shadow.
+ * - If not, drop an entry and shadow the new frame with a full 4K copy.
* - When a write to a guests L4 pagetable occurs, the update must be
* propagated to all existing shadows. An IPI is sent to the domains
* dirty mask indicating which frame/slot was updated, and each pcpu
* checks to see whether it needs to sync the update into its shadow.
+ * - When a guest L4 pagetable is freed, it must be dropped from any caches,
+ * as Xen will allow it to become writeable to the guest again, and its
+ * contents will go stale. The purge uses the same IPI mechanism as writes.
+ */
+
+#define L4_SHADOW_ORDER 2
+#define NR_L4_SHADOWS (1ul << L4_SHADOW_ORDER)
+
+/*
+ * An individual cache entry. Contains a %cr3 which has been cached, and the
+ * index of the percpu shadow frame used for it.
+ *
+ * The layout relies on %cr3 being page aligned, with the index stored in the
+ * lower bits. idx could be a smaller bitfield, but there is no other
+ * information to store, and having it as an 8bit field results in better
+ * compiled code.
*/
+typedef union pt_cache_entry {
+ unsigned long raw;
+ struct {
+ uint8_t idx;
+ unsigned long :4, cr3_mfn:52;
+ };
+} pt_cache_entry_t;
struct pt_shadow {
/*
- * A frame used to shadow a vcpus intended pagetable. When shadowing,
- * this frame is the one actually referenced by %cr3.
+ * A cache of frames used to shadow a vcpu's intended pagetables. When
+ * shadowing, one of these frames is the one actually referenced by %cr3.
*/
paddr_t shadow_l4;
l4_pgentry_t *shadow_l4_va;
@@ -63,29 +92,60 @@ struct pt_shadow {
*/
const struct domain *domain;
- /* If nonzero, a guests pagetable which we are shadowing. */
- paddr_t shadowing;
+ /*
+ * A collection of %cr3's, belonging to @p domain, which are shadowed
+ * locally.
+ *
+ * A cache entry is in use if cr3_mfn != 0, and free otherwise. The cache is
+ * maintained in most-recently-used order; as a result, cache[0].cr3_mfn
+ * should always match the running vcpu's cr3 (v->arch.cr3 >> PAGE_SHIFT).
+ *
+ * The cache[].idx fields are always unique and in the range
+ * [0, NR_L4_SHADOWS). Their order, however, will vary as the
+ * most-recently-used order is maintained.
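+ *
+ * e.g. having just switched to cr3 A after previously running on cr3 B,
+ * the cache reads { A, B, <free>, <free> }, each entry keeping its idx.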
+ */
+ pt_cache_entry_t cache[NR_L4_SHADOWS];
};
static DEFINE_PER_CPU(struct pt_shadow, ptsh);
+static l4_pgentry_t *shadow_l4_va(struct pt_shadow *ptsh, unsigned int idx)
+{
+ return _p(ptsh->shadow_l4_va) + idx * PAGE_SIZE;
+}
+
+static paddr_t shadow_l4(struct pt_shadow *ptsh, unsigned int idx)
+{
+ return ptsh->shadow_l4 + idx * PAGE_SIZE;
+}
+
int pt_shadow_alloc(unsigned int cpu)
{
struct pt_shadow *ptsh = &per_cpu(ptsh, cpu);
- unsigned int memflags = 0;
+ unsigned int memflags = 0, i;
nodeid_t node = cpu_to_node(cpu);
struct page_info *pg;
+ mfn_t mfns[NR_L4_SHADOWS];
if ( node != NUMA_NO_NODE )
memflags = MEMF_node(node);
- pg = alloc_domheap_page(NULL, memflags);
+ pg = alloc_domheap_pages(NULL, L4_SHADOW_ORDER, memflags);
if ( !pg )
return -ENOMEM;
ptsh->shadow_l4 = page_to_maddr(pg);
- ptsh->shadow_l4_va = __map_domain_page_global(pg);
+ for ( i = 0; i < ARRAY_SIZE(mfns); ++i )
+ {
+ /* Initialise the cache (ascending idx fields). */
+ ptsh->cache[i] = (pt_cache_entry_t){ i };
+
+ /* Collect MFNs to vmap(). */
+ mfns[i] = mfn_add(maddr_to_mfn(ptsh->shadow_l4), i);
+ }
+
+ ptsh->shadow_l4_va = vmap(mfns, ARRAY_SIZE(mfns));
if ( !ptsh->shadow_l4_va )
return -ENOMEM;
@@ -98,17 +158,35 @@ void pt_shadow_free(unsigned int cpu)
if ( ptsh->shadow_l4_va )
{
- unmap_domain_page_global(ptsh->shadow_l4_va);
+ vunmap(ptsh->shadow_l4_va);
ptsh->shadow_l4_va = NULL;
}
if ( ptsh->shadow_l4 )
{
- free_domheap_page(maddr_to_page(ptsh->shadow_l4));
+ free_domheap_pages(maddr_to_page(ptsh->shadow_l4), L4_SHADOW_ORDER);
ptsh->shadow_l4 = 0;
}
}
+static pt_cache_entry_t *pt_cache_lookup(
+ struct pt_shadow *ptsh, unsigned long maddr)
+{
+ unsigned int i;
+
+ ASSERT(!local_irq_is_enabled());
+
+ for ( i = 0; i < ARRAY_SIZE(ptsh->cache); ++i )
+ {
+ pt_cache_entry_t *ent = &ptsh->cache[i];
+
+ if ( ent->cr3_mfn == (maddr >> PAGE_SHIFT) )
+ return ent;
+ }
+
+ return NULL;
+}
+
/*
* We only need to shadow 4-level PV guests. All other guests have per-vcpu
* monitor tables which are never scheduled on concurrent pcpus. Care needs
@@ -126,6 +204,7 @@ unsigned long pt_maybe_shadow(struct vcpu *v)
unsigned int cpu = smp_processor_id();
struct pt_shadow *ptsh = &per_cpu(ptsh, cpu);
unsigned long flags, new_cr3 = v->arch.cr3;
+ pt_cache_entry_t *ent;
/*
* IPIs for updates are based on the domain dirty mask. If we ever switch
@@ -135,8 +214,12 @@ unsigned long pt_maybe_shadow(struct vcpu *v)
if ( ptsh->domain &&
ptsh->domain != v->domain )
{
+ unsigned int i;
+
ptsh->domain = NULL;
- ptsh->shadowing = 0;
+
+ for ( i = 0; i < ARRAY_SIZE(ptsh->cache); ++i )
+ ptsh->cache[i].cr3_mfn = 0;
}
/* No shadowing necessary? Run on the intended pagetable. */
@@ -145,10 +228,6 @@ unsigned long pt_maybe_shadow(struct vcpu *v)
ptsh->domain = v->domain;
- /* Fastpath, if we are already shadowing the intended pagetable. */
- if ( ptsh->shadowing == new_cr3 )
- return ptsh->shadow_l4;
-
/*
* We may be called with interrupts disabled (e.g. context switch), or
* interrupts enabled (e.g. new_guest_cr3()).
@@ -158,14 +237,46 @@ unsigned long pt_maybe_shadow(struct vcpu *v)
*/
local_irq_save(flags);
+ ent = pt_cache_lookup(ptsh, new_cr3);
+ if ( ent )
+ {
+ /*
+ * Cache hit. Promote this entry to being most recently used (if it
+ * isn't already).
+ */
+ unsigned int cache_idx = ent - ptsh->cache;
+
+ if ( cache_idx )
+ {
+ pt_cache_entry_t tmp = *ent;
+
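+ /* Deliberate fall-through: slide entries 0 .. cache_idx-1 down one slot. */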
+ switch ( cache_idx )
+ {
+ case 3: ptsh->cache[3] = ptsh->cache[2];
+ case 2: ptsh->cache[2] = ptsh->cache[1];
+ case 1: ptsh->cache[1] = ptsh->cache[0];
+ ptsh->cache[0] = tmp;
+ }
+ }
+ local_irq_restore(flags);
+ }
+ else
{
+ /*
+ * Cache miss. Recycle whatever was in the last slot, promote it to
+ * being most recently used, and copy the entire pagetable.
+ */
unsigned int slot = l4_table_offset(PERCPU_LINEAR_START);
+ unsigned int idx = ptsh->cache[3].idx;
l4_pgentry_t *l4t, *vcpu_l4t;
- ptsh->shadowing = new_cr3;
+ ptsh->cache[3] = ptsh->cache[2];
+ ptsh->cache[2] = ptsh->cache[1];
+ ptsh->cache[1] = ptsh->cache[0];
+ ptsh->cache[0] = (pt_cache_entry_t){ new_cr3 | idx };
local_irq_restore(flags);
- l4t = ptsh->shadow_l4_va;
+ l4t = shadow_l4_va(ptsh, idx);
vcpu_l4t = map_domain_page(maddr_to_mfn(new_cr3));
/*
@@ -184,7 +295,9 @@ unsigned long pt_maybe_shadow(struct vcpu *v)
unmap_domain_page(vcpu_l4t);
}
- return ptsh->shadow_l4;
+ ASSERT(ptsh->cache[0].cr3_mfn == (new_cr3 >> PAGE_SHIFT));
+
+ return shadow_l4(ptsh, ptsh->cache[0].idx);
}
struct ptsh_ipi_info
@@ -193,6 +306,7 @@ struct ptsh_ipi_info
const struct page_info *pg;
enum {
PTSH_IPI_WRITE,
+ PTSH_IPI_INVLPG,
} op;
unsigned int slot;
};
@@ -202,29 +316,49 @@ static void _pt_shadow_ipi(void *arg)
unsigned int cpu = smp_processor_id();
struct pt_shadow *ptsh = &per_cpu(ptsh, cpu);
const struct ptsh_ipi_info *info = arg;
- unsigned long maddr = page_to_maddr(info->pg);
+ pt_cache_entry_t *ent;
/* No longer shadowing state from this domain? Nothing to do. */
if ( info->d != ptsh->domain )
return;
+ ent = pt_cache_lookup(ptsh, page_to_maddr(info->pg));
+
/* Not shadowing this frame? Nothing to do. */
- if ( ptsh->shadowing != maddr )
+ if ( ent == NULL )
return;
switch ( info->op )
{
l4_pgentry_t *l4t, *vcpu_l4t;
+ unsigned int cache_idx, shadow_idx;
case PTSH_IPI_WRITE:
- l4t = ptsh->shadow_l4_va;
- vcpu_l4t = map_domain_page(maddr_to_mfn(maddr));
+ l4t = shadow_l4_va(ptsh, ent->idx);
+ vcpu_l4t = map_domain_page(page_to_mfn(info->pg));
l4t[info->slot] = vcpu_l4t[info->slot];
unmap_domain_page(vcpu_l4t);
break;
+ case PTSH_IPI_INVLPG:
+ cache_idx = ent - ptsh->cache;
+ shadow_idx = ent->idx;
+
+ /*
+ * Demote the dropped entry to least-recently-used, so it is the next
+ * entry to be reused.
+ */
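+ /* Deliberate fall-through: less-recently-used entries each move up one slot. */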
+ switch ( cache_idx )
+ {
+ case 0: BUG(); /* ??? Freeing the L4 which current is running on! */
+ case 1: ptsh->cache[1] = ptsh->cache[2];
+ case 2: ptsh->cache[2] = ptsh->cache[3];
+ case 3: ptsh->cache[3] = (pt_cache_entry_t){ shadow_idx };
+ }
+ break;
+
default:
ASSERT_UNREACHABLE();
}
@@ -248,6 +382,22 @@ void pt_shadow_l4_write(const struct domain *d, const struct page_info *pg,
on_selected_cpus(d->domain_dirty_cpumask, _pt_shadow_ipi, &info, 1);
}
+void pt_shadow_l4_invlpg(const struct domain *d, const struct page_info *pg)
+{
+ struct ptsh_ipi_info info;
+
+ if ( !pt_need_shadow(d) )
+ return;
+
+ info = (struct ptsh_ipi_info){
+ .d = d,
+ .pg = pg,
+ .op = PTSH_IPI_INVLPG,
+ };
+
+ on_selected_cpus(d->domain_dirty_cpumask, _pt_shadow_ipi, &info, 1);
+}
+
/*
* Local variables:
* mode: C
diff --git a/xen/include/asm-x86/pv/pt-shadow.h b/xen/include/asm-x86/pv/pt-shadow.h
index 6e71e99..d5576f4 100644
--- a/xen/include/asm-x86/pv/pt-shadow.h
+++ b/xen/include/asm-x86/pv/pt-shadow.h
@@ -47,6 +47,13 @@ unsigned long pt_maybe_shadow(struct vcpu *v);
void pt_shadow_l4_write(
const struct domain *d, const struct page_info *pg, unsigned int slot);
+/*
+ * Called when an L4 pagetable is freed. The PT shadow logic ensures that it
+ * is purged from any caches.
+ */
+void pt_shadow_l4_invlpg(
+ const struct domain *d, const struct page_info *pg);
+
#else /* !CONFIG_PV */
static inline int pt_shadow_alloc(unsigned int cpu) { return 0; }
@@ -58,6 +65,8 @@ static inline unsigned long pt_maybe_shadow(struct vcpu *v)
}
static inline void pt_shadow_l4_write(
const struct domain *d, const struct page_info *pg, unsigned int slot) { }
+static inline void pt_shadow_l4_invlpg(
+ const struct domain *d, const struct page_info *pg) { }
#endif /* CONFIG_PV */
--
2.1.4