[Xen-devel] [PATCH RFC 13/44] x86/pt-shadow: Shadow L4 tables from 64bit PV guests
See the code comments for reasoning and the algorithm description.

This is a very simplistic algorithm, which comes with a substantial
performance overhead.  The algorithm will be improved in a later patch, once
more infrastructure is in place.

Some of the code (particularly in pt_maybe_shadow()) is structured oddly.
This is deliberate, to simplify the later algorithm improvement and to avoid
unnecessary code motion getting in the way of the logical change.
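
Roughly, the two paths are sketched below (illustrative only, simplified
from the code in this patch: the percpu fixmap mapping, interrupt masking
and domain bookkeeping are elided, and map_guest_l4() is a hypothetical
stand-in for mapping the guest's frame):

    /* Context switch / new_guest_cr3(): pick the cr3 for hardware to use. */
    unsigned long pt_maybe_shadow(struct vcpu *v)
    {
        struct pt_shadow *ptsh = &this_cpu(ptsh);

        if ( !pt_need_shadow(v->domain) )
            return v->arch.cr3;                /* No shadowing: real L4. */

        if ( ptsh->shadowing != v->arch.cr3 )  /* Shadow stale or absent? */
        {
            /* map_guest_l4() stands in for the percpu fixmap logic. */
            l4_pgentry_t *vcpu_l4t = map_guest_l4(v->arch.cr3);

            copy_page(ptsh->shadow_l4_va, vcpu_l4t);  /* Full 4K copy. */
            ptsh->shadowing = v->arch.cr3;
        }

        return ptsh->shadow_l4;                /* Run on the shadow. */
    }

    /* Writes to a shadowed L4 IPI the domain's dirty mask; each pcpu then
     * re-copies just the updated slot into its own shadow, if it is
     * currently shadowing the written frame. */
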
Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
v3:
* Rebase over change to using ptsh
* Rework to be as close to the eventual algorithm as possible, before we
  get a map_domain_page() which is usable in context-switch context.
---
xen/arch/x86/mm.c | 5 +-
xen/arch/x86/mm/shadow/multi.c | 2 +
xen/arch/x86/pv/mm.h | 16 +++-
xen/arch/x86/pv/pt-shadow.c | 164 +++++++++++++++++++++++++++++++++++++
xen/include/asm-x86/fixmap.h | 1 +
xen/include/asm-x86/pv/pt-shadow.h | 24 ++++++
6 files changed, 209 insertions(+), 3 deletions(-)
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index f85ef6c..375565f 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -126,6 +126,7 @@
#include <asm/hvm/grant_table.h>
#include <asm/pv/grant_table.h>
#include <asm/pv/mm.h>
+#include <asm/pv/pt-shadow.h>
#include "pv/mm.h"
@@ -501,13 +502,15 @@ DEFINE_PER_CPU(unsigned long, curr_ptbase);
void do_write_ptbase(struct vcpu *v, bool tlb_maintenance)
{
- unsigned long new_cr3 = v->arch.cr3;
+ unsigned long new_cr3;
unsigned int cpu = smp_processor_id();
unsigned long *this_curr_ptbase = &per_cpu(curr_ptbase, cpu);
/* Check that %cr3 isn't being shuffled under our feet. */
ASSERT(*this_curr_ptbase == read_cr3());
+ new_cr3 = pt_maybe_shadow(v);
+
if ( tlb_maintenance )
write_cr3(new_cr3);
else
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index c4e954e..9c929ed 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -39,6 +39,7 @@ asm(".file \"" __OBJECT_FILE__ "\"");
#include <asm/hvm/cacheattr.h>
#include <asm/mtrr.h>
#include <asm/guest_pt.h>
+#include <asm/pv/pt-shadow.h>
#include <public/sched.h>
#include "private.h"
#include "types.h"
@@ -952,6 +953,7 @@ static int shadow_set_l4e(struct domain *d,
/* Write the new entry */
shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
+ pt_shadow_l4_write(d, mfn_to_page(sl4mfn), pgentry_ptr_to_slot(sl4e));
flags |= SHADOW_SET_CHANGED;
if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h
index a10b09a..7c66ca7 100644
--- a/xen/arch/x86/pv/mm.h
+++ b/xen/arch/x86/pv/mm.h
@@ -1,6 +1,8 @@
#ifndef __PV_MM_H__
#define __PV_MM_H__
+#include <asm/pv/pt-shadow.h>
+
l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn);
int new_guest_cr3(mfn_t mfn);
@@ -38,7 +40,7 @@ static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear)
*/
static inline bool update_intpte(intpte_t *p, intpte_t old, intpte_t new,
unsigned long mfn, struct vcpu *v,
- bool preserve_ad)
+ bool preserve_ad, unsigned int level)
{
bool rv = true;
@@ -77,6 +79,11 @@ static inline bool update_intpte(intpte_t *p, intpte_t old, intpte_t new,
old = t;
}
}
+
+ if ( level == 4 )
+ pt_shadow_l4_write(v->domain, mfn_to_page(mfn),
+ pgentry_ptr_to_slot(p));
+
return rv;
}
@@ -87,7 +94,12 @@ static inline bool update_intpte(intpte_t *p, intpte_t old, intpte_t new,
#define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad) \
update_intpte(&_t ## e_get_intpte(*(_p)), \
_t ## e_get_intpte(_o), _t ## e_get_intpte(_n), \
- (_m), (_v), (_ad))
+ (_m), (_v), (_ad), _t ## _LEVEL)
+
+#define l1_LEVEL 1
+#define l2_LEVEL 2
+#define l3_LEVEL 3
+#define l4_LEVEL 4
static inline l1_pgentry_t adjust_guest_l1e(l1_pgentry_t l1e,
const struct domain *d)
diff --git a/xen/arch/x86/pv/pt-shadow.c b/xen/arch/x86/pv/pt-shadow.c
index 7db8efb..46a0251 100644
--- a/xen/arch/x86/pv/pt-shadow.c
+++ b/xen/arch/x86/pv/pt-shadow.c
@@ -22,8 +22,32 @@
#include <xen/mm.h>
#include <xen/numa.h>
+#include <asm/fixmap.h>
#include <asm/pv/pt-shadow.h>
+/*
+ * To use percpu linear ranges, we require that no two pcpus have %cr3
+ * pointing at the same L4 pagetable at the same time.
+ *
+ * Guests, however, might choose to use the same L4 pagetable on multiple
+ * vcpus at once, e.g. concurrently scheduling two threads from the same
+ * process.  In practice, all HVM guests and 32bit PV guests run on
+ * Xen-provided per-vcpu monitor tables, so it is only 64bit PV guests
+ * which are an issue.
+ *
+ * To resolve the issue, we shadow L4 pagetables from 64bit PV guests when
+ * they are in context.
+ *
+ * The algorithm is fairly simple.
+ *
+ * - When a pcpu is switching to a new vcpu cr3 and shadowing is necessary,
+ *   perform a full 4K copy of the guest's frame into a percpu frame, and
+ *   run on that.
+ * - When a write to a guest's L4 pagetable occurs, the update must be
+ *   propagated to all existing shadows.  An IPI is sent to the pcpus in
+ *   the domain's dirty mask, indicating which frame/slot was updated, and
+ *   each pcpu checks to see whether it needs to sync the update into its
+ *   shadow.
+ */
+
struct pt_shadow {
/*
* A frame used to shadow a vcpu's intended pagetable. When shadowing,
@@ -31,6 +55,17 @@ struct pt_shadow {
*/
paddr_t shadow_l4;
l4_pgentry_t *shadow_l4_va;
+
+ /*
+ * Domain to which the shadowed state belongs, or NULL if no state is
+ * being cached. IPIs for updates to cached information are based on the
+ * domain dirty mask, which can race with the target of the IPI switching
+ * to a different context.
+ */
+ const struct domain *domain;
+
+ /* If nonzero, the guest's pagetable which we are shadowing. */
+ paddr_t shadowing;
};
static DEFINE_PER_CPU(struct pt_shadow, ptsh);
@@ -76,6 +111,135 @@ void pt_shadow_free(unsigned int cpu)
}
/*
+ * We only need to shadow 4-level PV guests. All other guests have per-vcpu
+ * monitor tables which are never scheduled on concurrent pcpus. Care needs
+ * to be taken not to shadow d0v0 during construction, as it writes its L4
+ * directly.
+ */
+static bool pt_need_shadow(const struct domain *d)
+{
+ return (system_state >= SYS_STATE_active && is_pv_domain(d) &&
+ !is_idle_domain(d) && !is_pv_32bit_domain(d) && d->max_vcpus > 1);
+}
+
+unsigned long pt_maybe_shadow(struct vcpu *v)
+{
+ unsigned int cpu = smp_processor_id();
+ struct pt_shadow *ptsh = &per_cpu(ptsh, cpu);
+ unsigned long flags, new_cr3 = v->arch.cr3;
+
+ /*
+ * IPIs for updates are based on the domain dirty mask. If we ever switch
+ * out of the currently shadowed context (even to idle), the cache will
+ * become stale.
+ */
+ if ( ptsh->domain &&
+ ptsh->domain != v->domain )
+ {
+ ptsh->domain = NULL;
+ ptsh->shadowing = 0;
+ }
+
+ /* No shadowing necessary? Run on the intended pagetable. */
+ if ( !pt_need_shadow(v->domain) )
+ return new_cr3;
+
+ ptsh->domain = v->domain;
+
+ /* Fastpath, if we are already shadowing the intended pagetable. */
+ if ( ptsh->shadowing == new_cr3 )
+ return ptsh->shadow_l4;
+
+ /*
+ * We may be called with interrupts disabled (e.g. context switch), or
+ * interrupts enabled (e.g. new_guest_cr3()).
+ *
+ * Reads and modifications of ptsh-> are only on the local cpu, but must
+ * be excluded against reads and modifications in _pt_shadow_ipi().
+ */
+ local_irq_save(flags);
+
+ {
+ l4_pgentry_t *l4t, *vcpu_l4t;
+
+ set_percpu_fixmap(cpu, PERCPU_FIXSLOT_SHADOW,
+ l1e_from_paddr(new_cr3, __PAGE_HYPERVISOR_RO));
+ ptsh->shadowing = new_cr3;
+ local_irq_restore(flags);
+
+ l4t = ptsh->shadow_l4_va;
+ vcpu_l4t = percpu_fix_to_virt(cpu, PERCPU_FIXSLOT_SHADOW);
+
+ copy_page(l4t, vcpu_l4t);
+ }
+
+ return ptsh->shadow_l4;
+}
+
+struct ptsh_ipi_info
+{
+ const struct domain *d;
+ const struct page_info *pg;
+ enum {
+ PTSH_IPI_WRITE,
+ } op;
+ unsigned int slot;
+};
+
+static void _pt_shadow_ipi(void *arg)
+{
+ unsigned int cpu = smp_processor_id();
+ struct pt_shadow *ptsh = &per_cpu(ptsh, cpu);
+ const struct ptsh_ipi_info *info = arg;
+ unsigned long maddr = page_to_maddr(info->pg);
+
+ /* No longer shadowing state from this domain? Nothing to do. */
+ if ( info->d != ptsh->domain )
+ return;
+
+ /* Not shadowing this frame? Nothing to do. */
+ if ( ptsh->shadowing != maddr )
+ return;
+
+ switch ( info->op )
+ {
+ l4_pgentry_t *l4t, *vcpu_l4t;
+
+ case PTSH_IPI_WRITE:
+ l4t = ptsh->shadow_l4_va;
+
+ /* Reuse the mapping established in pt_maybe_shadow(). */
+ ASSERT(l1e_get_paddr(*percpu_fixmap_l1e(cpu, PERCPU_FIXSLOT_SHADOW)) ==
+ maddr);
+ vcpu_l4t = percpu_fix_to_virt(cpu, PERCPU_FIXSLOT_SHADOW);
+
+ l4t[info->slot] = vcpu_l4t[info->slot];
+ break;
+
+ default:
+ ASSERT_UNREACHABLE();
+ }
+}
+
+void pt_shadow_l4_write(const struct domain *d, const struct page_info *pg,
+ unsigned int slot)
+{
+ struct ptsh_ipi_info info;
+
+ if ( !pt_need_shadow(d) )
+ return;
+
+ info = (struct ptsh_ipi_info){
+ .d = d,
+ .pg = pg,
+ .op = PTSH_IPI_WRITE,
+ .slot = slot,
+ };
+
+ on_selected_cpus(d->domain_dirty_cpumask, _pt_shadow_ipi, &info, 1);
+}
+
+/*
* Local variables:
* mode: C
* c-file-style: "BSD"
diff --git a/xen/include/asm-x86/fixmap.h b/xen/include/asm-x86/fixmap.h
index d46939a..748219f 100644
--- a/xen/include/asm-x86/fixmap.h
+++ b/xen/include/asm-x86/fixmap.h
@@ -28,6 +28,7 @@
#include <acpi/apei.h>
#define NR_PERCPU_SLOTS 1
+#define PERCPU_FIXSLOT_SHADOW 0
/*
* Here we define all the compile-time 'special' virtual
diff --git a/xen/include/asm-x86/pv/pt-shadow.h b/xen/include/asm-x86/pv/pt-shadow.h
index ff99c85..6e71e99 100644
--- a/xen/include/asm-x86/pv/pt-shadow.h
+++ b/xen/include/asm-x86/pv/pt-shadow.h
@@ -21,6 +21,8 @@
#ifndef __X86_PV_PT_SHADOW_H__
#define __X86_PV_PT_SHADOW_H__
+#include <xen/sched.h>
+
#ifdef CONFIG_PV
/*
@@ -30,11 +32,33 @@
int pt_shadow_alloc(unsigned int cpu);
void pt_shadow_free(unsigned int cpu);
+/*
+ * Called for context switches, and when a vcpu explicitly changes cr3. The
+ * PT shadow logic returns the cr3 hardware should run on, which is either
+ * v->arch.cr3 (no shadowing necessary), or a local frame (which is a suitable
+ * shadow of v->arch.cr3).
+ */
+unsigned long pt_maybe_shadow(struct vcpu *v);
+
+/*
+ * Called when a write occurs to an L4 pagetable. The PT shadow logic brings
+ * any shadows of this page up-to-date.
+ */
+void pt_shadow_l4_write(
+ const struct domain *d, const struct page_info *pg, unsigned int slot);
+
#else /* !CONFIG_PV */
static inline int pt_shadow_alloc(unsigned int cpu) { return 0; }
static inline void pt_shadow_free(unsigned int cpu) { }
+static inline unsigned long pt_maybe_shadow(struct vcpu *v)
+{
+ return v->arch.cr3;
+}
+static inline void pt_shadow_l4_write(
+ const struct domain *d, const struct page_info *pg, unsigned int slot) { }
+
#endif /* CONFIG_PV */
#endif /* __X86_PV_PT_SHADOW_H__ */
--
2.1.4