[Xen-devel] [PATCH v2 4/6] x86/NPT: don't walk entire page tables when changing types on a range
This builds on the fact that in order for no NPF VM exit to occur,
_PAGE_USER must always be set. I.e. by clearing the flag we can force a
VM exit, allowing us to do lazy type changes similar to those done on EPT.
That way, the generic entry-wise code can go away, and we can remove
the range restriction enforced on HVMOP_track_dirty_vram for XSA-27.
Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
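
For illustration only (not part of the patch): a minimal, self-contained sketch of the deferral idea described above -- a range is marked cheaply by clearing a user-style permission bit on present entries, and an entry's type is only recalculated when a fault on it actually occurs. All names and flag values below are invented for the example; the real mechanism uses the p2m entry flags and the write_p2m_entry() hooks shown in the diff.

/*
 * Sketch of lazy, fault-driven type recalculation.  Invented flags and
 * types; this is not Xen code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define F_PRESENT 0x1u
#define F_USER    0x4u                       /* stand-in for _PAGE_USER */

enum ptype { T_RAM_RW, T_LOGDIRTY };

struct entry { uint32_t flags; enum ptype type; };

/* Range operation: don't rewrite every type now, just clear the user bit. */
static void mark_range_for_recalc(struct entry *e, unsigned int n)
{
    for (unsigned int i = 0; i < n; i++)
        if (e[i].flags & F_PRESENT)
            e[i].flags &= ~F_USER;           /* next access will fault */
}

/* Fault handler: recalculate the type lazily, then restore the user bit. */
static void handle_fault(struct entry *e, bool in_logdirty_range)
{
    if ((e->flags & F_PRESENT) && !(e->flags & F_USER)) {
        e->type = in_logdirty_range ? T_LOGDIRTY : T_RAM_RW;
        e->flags |= F_USER;                  /* no further faults for this entry */
    }
}

int main(void)
{
    struct entry tbl[4] = {
        { F_PRESENT | F_USER, T_RAM_RW }, { F_PRESENT | F_USER, T_RAM_RW },
        { F_PRESENT | F_USER, T_RAM_RW }, { F_PRESENT | F_USER, T_RAM_RW },
    };

    mark_range_for_recalc(tbl, 4);           /* cheap: no per-entry type walk */
    handle_fault(&tbl[2], true);             /* only the touched entry is fixed up */
    printf("entry 2: type=%d flags=%#x\n", tbl[2].type, (unsigned int)tbl[2].flags);
    return 0;
}
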
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -2557,7 +2557,16 @@ void svm_vmexit_handler(struct cpu_user_
perfc_incra(svmexits, VMEXIT_NPF_PERFC);
if ( cpu_has_svm_decode )
v->arch.hvm_svm.cached_insn_len = vmcb->guest_ins_len & 0xf;
- svm_do_nested_pgfault(v, regs, vmcb->exitinfo1, vmcb->exitinfo2);
+ rc = p2m_npt_fault(vmcb->exitinfo2);
+ if ( rc >= 0 )
+ svm_do_nested_pgfault(v, regs, vmcb->exitinfo1, vmcb->exitinfo2);
+ else
+ {
+ printk(XENLOG_G_ERR
+ "%pv: Error %d handling NPF (gpa=%08lx ec=%04lx)\n",
+ v, rc, vmcb->exitinfo2, vmcb->exitinfo1);
+ domain_crash(v->domain);
+ }
v->arch.hvm_svm.cached_insn_len = 0;
break;
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -728,10 +728,7 @@ void p2m_change_type_range(struct domain
unsigned long start, unsigned long end,
p2m_type_t ot, p2m_type_t nt)
{
- p2m_access_t a;
- p2m_type_t pt;
unsigned long gfn = start;
- mfn_t mfn;
struct p2m_domain *p2m = p2m_get_hostp2m(d);
int rc = 0;
@@ -750,47 +747,8 @@ void p2m_change_type_range(struct domain
}
end = p2m->max_mapped_pfn + 1;
}
-
- if ( gfn < end && p2m->change_entry_type_range )
- {
+ if ( gfn < end )
rc = p2m->change_entry_type_range(p2m, ot, nt, gfn, end - 1);
- gfn = end;
- }
- while ( !rc && gfn < end )
- {
- unsigned int order;
-
- mfn = p2m->get_entry(p2m, gfn, &pt, &a, 0, &order);
- while ( order > PAGE_ORDER_4K )
- {
- unsigned long mask = ~0UL << order;
-
- /*
- * Log-dirty ranges starting/ending in the middle of a super page
- * (with a page split still pending) can't have a consistent type
- * reported for the full range and hence need the split to be
- * enforced here.
- */
- if ( !p2m_is_changeable(pt) ||
- p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask) >= 0 )
- {
- if ( pt != ot )
- break;
- if ( !(gfn & ~mask) && end > (gfn | ~mask) )
- break;
- }
- if ( order == PAGE_ORDER_1G )
- order = PAGE_ORDER_2M;
- else
- order = PAGE_ORDER_4K;
- }
- if ( pt == ot )
- rc = p2m_set_entry(p2m, gfn, mfn, order, nt, a);
- gfn += 1UL << order;
- gfn &= -1UL << order;
- if ( !gfn )
- break;
- }
if ( rc )
{
printk(XENLOG_G_ERR "Error %d changing Dom%d GFNs [%lx,%lx] from %d to %d\n",
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -60,6 +60,19 @@
#define P2M_BASE_FLAGS \
(_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED)
+#define RECALC_FLAGS (_PAGE_USER|_PAGE_ACCESSED)
+#define set_recalc(level, ent) level##e_remove_flags(ent, RECALC_FLAGS)
+#define clear_recalc(level, ent) level##e_add_flags(ent, RECALC_FLAGS)
+#define _needs_recalc(flags) (!((flags) & _PAGE_USER))
+#define needs_recalc(level, ent) _needs_recalc(level##e_get_flags(ent))
+#define valid_recalc(level, ent) (!(level##e_get_flags(ent) & _PAGE_ACCESSED))
+
+static const unsigned long pgt[] = {
+ PGT_l1_page_table,
+ PGT_l2_page_table,
+ PGT_l3_page_table
+};
+
static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
{
unsigned long flags;
@@ -272,6 +285,196 @@ p2m_next_level(struct p2m_domain *p2m, v
return 0;
}
+/*
+ * Mark (via clearing the U flag) as needing P2M type re-calculation all valid
+ * present entries at the targeted level for the passed in GFN range, which is
+ * guaranteed to not cross a page (table) boundary at that level.
+ */
+static int p2m_pt_set_recalc_range(struct p2m_domain *p2m,
+ unsigned int level,
+ unsigned long first_gfn,
+ unsigned long last_gfn)
+{
+ void *table;
+ unsigned long gfn_remainder = first_gfn, remainder;
+ unsigned int i;
+ l1_pgentry_t *pent, *plast;
+ int err = 0;
+
+ table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+ for ( i = 4; i-- > level; )
+ {
+ remainder = gfn_remainder;
+ pent = p2m_find_entry(table, &remainder, first_gfn,
+ i * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER);
+ if ( !pent )
+ {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if ( !(l1e_get_flags(*pent) & _PAGE_PRESENT) )
+ goto out;
+
+ err = p2m_next_level(p2m, &table, &gfn_remainder, first_gfn,
+ i * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER,
+ pgt[i - 1]);
+ if ( err )
+ goto out;
+ }
+
+ remainder = gfn_remainder + (last_gfn - first_gfn);
+ pent = p2m_find_entry(table, &gfn_remainder, first_gfn,
+ i * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER);
+ plast = p2m_find_entry(table, &remainder, last_gfn,
+ i * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER);
+ if ( pent && plast )
+ for ( ; pent <= plast; ++pent )
+ {
+ l1_pgentry_t e = *pent;
+
+ if ( (l1e_get_flags(e) & _PAGE_PRESENT) && !needs_recalc(l1, e) )
+ {
+ set_recalc(l1, e);
+ p2m->write_p2m_entry(p2m, first_gfn, pent, e, level);
+ }
+ first_gfn += 1UL << (i * PAGETABLE_ORDER);
+ }
+ else
+ err = -EIO;
+
+ out:
+ unmap_domain_page(table);
+
+ return err;
+}
+
+/*
+ * Handle possibly necessary P2M type re-calculation (U flag clear for a
+ * present entry) for the entries in the page table hierarchy for the given
+ * GFN. Propagate the re-calculation flag down to the next page table level
+ * for entries not involved in the translation of the given GFN.
+ */
+static int do_recalc(struct p2m_domain *p2m, unsigned long gfn)
+{
+ void *table;
+ unsigned long gfn_remainder = gfn;
+ unsigned int level = 4;
+ l1_pgentry_t *pent;
+ int err = 0;
+
+ table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+ while ( --level )
+ {
+ unsigned long remainder = gfn_remainder;
+
+ pent = p2m_find_entry(table, &remainder, gfn,
+ level * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER);
+ if ( !pent || !(l1e_get_flags(*pent) & _PAGE_PRESENT) )
+ goto out;
+
+ if ( l1e_get_flags(*pent) & _PAGE_PSE )
+ {
+ unsigned long mask = ~0UL << (level * PAGETABLE_ORDER);
+
+ if ( !needs_recalc(l1, *pent) ||
+ !p2m_is_changeable(p2m_flags_to_type(l1e_get_flags(*pent))) ||
+ p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask) >= 0 )
+ break;
+ }
+
+ err = p2m_next_level(p2m, &table, &gfn_remainder, gfn,
+ level * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER,
+ pgt[level - 1]);
+ if ( err )
+ goto out;
+
+ if ( needs_recalc(l1, *pent) )
+ {
+ l1_pgentry_t e = *pent, *ptab = table;
+ unsigned int i;
+
+ if ( !valid_recalc(l1, e) )
+ P2M_DEBUG("bogus recalc state at d%d:%lx:%u\n",
+ p2m->domain->domain_id, gfn, level);
+ remainder = gfn_remainder;
+ for ( i = 0; i < (1 << PAGETABLE_ORDER); ++i )
+ {
+ l1_pgentry_t ent = ptab[i];
+
+ if ( (l1e_get_flags(ent) & _PAGE_PRESENT) &&
+ !needs_recalc(l1, ent) )
+ {
+ set_recalc(l1, ent);
+ p2m->write_p2m_entry(p2m, gfn - remainder, &ptab[i],
+ ent, level);
+ }
+ remainder -= 1UL << ((level - 1) * PAGETABLE_ORDER);
+ }
+ smp_wmb();
+ clear_recalc(l1, e);
+ p2m->write_p2m_entry(p2m, gfn, pent, e, level + 1);
+ }
+ }
+
+ pent = p2m_find_entry(table, &gfn_remainder, gfn,
+ level * PAGETABLE_ORDER, 1 << PAGETABLE_ORDER);
+ if ( pent && (l1e_get_flags(*pent) & _PAGE_PRESENT) &&
+ needs_recalc(l1, *pent) )
+ {
+ l1_pgentry_t e = *pent;
+
+ if ( !valid_recalc(l1, e) )
+ P2M_DEBUG("bogus recalc leaf at d%d:%lx:%u\n",
+ p2m->domain->domain_id, gfn, level);
+ if ( p2m_is_changeable(p2m_flags_to_type(l1e_get_flags(e))) )
+ {
+ unsigned long mask = ~0UL << (level * PAGETABLE_ORDER);
+ p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask)
+ ? p2m_ram_logdirty : p2m_ram_rw;
+ unsigned long mfn = l1e_get_pfn(e);
+ unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn));
+
+ if ( level )
+ {
+ if ( flags & _PAGE_PAT )
+ {
+ BUILD_BUG_ON(_PAGE_PAT != _PAGE_PSE);
+ mfn |= _PAGE_PSE_PAT >> PAGE_SHIFT;
+ }
+ else
+ mfn &= ~(_PAGE_PSE_PAT >> PAGE_SHIFT);
+ flags |= _PAGE_PSE;
+ }
+ e = l1e_from_pfn(mfn, flags);
+ p2m_add_iommu_flags(&e, level,
+ (p2mt == p2m_ram_rw)
+ ? IOMMUF_readable|IOMMUF_writable : 0);
+ ASSERT(!needs_recalc(l1, e));
+ }
+ else
+ clear_recalc(l1, e);
+ p2m->write_p2m_entry(p2m, gfn, pent, e, level + 1);
+ }
+
+ out:
+ unmap_domain_page(table);
+
+ return err;
+}
+
+int p2m_npt_fault(uint64_t gpa)
+{
+ struct p2m_domain *p2m = p2m_get_hostp2m(current->domain);
+ int rc;
+
+ p2m_lock(p2m);
+ rc = do_recalc(p2m, PFN_DOWN(gpa));
+ p2m_unlock(p2m);
+
+ return rc;
+}
+
/* Returns: 0 for success, -errno for failure */
static int
p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
@@ -307,6 +510,11 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
__trace_var(TRC_MEM_SET_P2M_ENTRY, 0, sizeof(t), &t);
}
+ /* Carry out any eventually pending earlier changes first. */
+ rc = do_recalc(p2m, gfn);
+ if ( rc < 0 )
+ return rc;
+
table = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
rc = p2m_next_level(p2m, &table, &gfn_remainder, gfn,
L4_PAGETABLE_SHIFT - PAGE_SHIFT,
@@ -459,6 +667,15 @@ p2m_pt_set_entry(struct p2m_domain *p2m,
return rc;
}
+static inline p2m_type_t recalc_type(bool_t recalc, p2m_type_t t,
+ struct p2m_domain *p2m, unsigned long gfn)
+{
+ if ( !recalc || !p2m_is_changeable(t) )
+ return t;
+ return p2m_is_logdirty_range(p2m, gfn, gfn) ? p2m_ram_logdirty
+ : p2m_ram_rw;
+}
+
static mfn_t
p2m_pt_get_entry(struct p2m_domain *p2m, unsigned long gfn,
p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
@@ -468,8 +685,9 @@ p2m_pt_get_entry(struct p2m_domain *p2m,
paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
l2_pgentry_t *l2e;
l1_pgentry_t *l1e;
- unsigned long l1e_flags;
+ unsigned int flags;
p2m_type_t l1t;
+ bool_t recalc;
ASSERT(paging_mode_translate(p2m->domain));
@@ -496,15 +714,17 @@ p2m_pt_get_entry(struct p2m_domain *p2m,
return _mfn(INVALID_MFN);
}
mfn = _mfn(l4e_get_pfn(*l4e));
+ recalc = needs_recalc(l4, *l4e);
unmap_domain_page(l4e);
}
{
l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
l3e += l3_table_offset(addr);
pod_retry_l3:
- if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+ flags = l3e_get_flags(*l3e);
+ if ( !(flags & _PAGE_PRESENT) )
{
- if ( p2m_flags_to_type(l3e_get_flags(*l3e)) == p2m_populate_on_demand )
+ if ( p2m_flags_to_type(flags) == p2m_populate_on_demand )
{
if ( q & P2M_ALLOC )
{
@@ -518,12 +738,13 @@ pod_retry_l3:
unmap_domain_page(l3e);
return _mfn(INVALID_MFN);
}
- else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
+ if ( flags & _PAGE_PSE )
{
mfn = _mfn(l3e_get_pfn(*l3e) +
l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
l1_table_offset(addr));
- *t = p2m_flags_to_type(l3e_get_flags(*l3e));
+ *t = recalc_type(recalc || _needs_recalc(flags),
+ p2m_flags_to_type(flags), p2m, gfn);
unmap_domain_page(l3e);
ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
@@ -533,6 +754,8 @@ pod_retry_l3:
}
mfn = _mfn(l3e_get_pfn(*l3e));
+ if ( _needs_recalc(flags) )
+ recalc = 1;
unmap_domain_page(l3e);
}
@@ -540,10 +763,11 @@ pod_retry_l3:
l2e += l2_table_offset(addr);
pod_retry_l2:
- if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+ flags = l2e_get_flags(*l2e);
+ if ( !(flags & _PAGE_PRESENT) )
{
/* PoD: Try to populate a 2-meg chunk */
- if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
+ if ( p2m_flags_to_type(flags) == p2m_populate_on_demand )
{
if ( q & P2M_ALLOC ) {
if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_2M, q) )
@@ -555,10 +779,11 @@ pod_retry_l2:
unmap_domain_page(l2e);
return _mfn(INVALID_MFN);
}
- else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
+ if ( flags & _PAGE_PSE )
{
mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
- *t = p2m_flags_to_type(l2e_get_flags(*l2e));
+ *t = recalc_type(recalc || _needs_recalc(flags),
+ p2m_flags_to_type(flags), p2m, gfn);
unmap_domain_page(l2e);
ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
@@ -568,14 +793,16 @@ pod_retry_l2:
}
mfn = _mfn(l2e_get_pfn(*l2e));
+ if ( needs_recalc(l2, *l2e) )
+ recalc = 1;
unmap_domain_page(l2e);
l1e = map_domain_page(mfn_x(mfn));
l1e += l1_table_offset(addr);
pod_retry_l1:
- l1e_flags = l1e_get_flags(*l1e);
- l1t = p2m_flags_to_type(l1e_flags);
- if ( ((l1e_flags & _PAGE_PRESENT) == 0) && (!p2m_is_paging(l1t)) )
+ flags = l1e_get_flags(*l1e);
+ l1t = p2m_flags_to_type(flags);
+ if ( !(flags & _PAGE_PRESENT) && !p2m_is_paging(l1t) )
{
/* PoD: Try to populate */
if ( l1t == p2m_populate_on_demand )
@@ -591,7 +818,7 @@ pod_retry_l1:
return _mfn(INVALID_MFN);
}
mfn = _mfn(l1e_get_pfn(*l1e));
- *t = l1t;
+ *t = recalc_type(recalc || _needs_recalc(flags), l1t, p2m, gfn);
unmap_domain_page(l1e);
ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t) || p2m_is_paging(*t));
@@ -714,6 +941,47 @@ static void p2m_pt_change_entry_type_glo
unmap_domain_page(l4e);
}
+static int p2m_pt_change_entry_type_range(struct p2m_domain *p2m,
+ p2m_type_t ot, p2m_type_t nt,
+ unsigned long first_gfn,
+ unsigned long last_gfn)
+{
+ unsigned long mask = (1 << PAGETABLE_ORDER) - 1;
+ unsigned int i;
+ int err = 0;
+
+ ASSERT(hap_enabled(p2m->domain));
+
+ for ( i = 1; i <= 4; )
+ {
+ if ( first_gfn & mask )
+ {
+ unsigned long end_gfn = min(first_gfn | mask, last_gfn);
+
+ err = p2m_pt_set_recalc_range(p2m, i, first_gfn, end_gfn);
+ if ( err || end_gfn >= last_gfn )
+ break;
+ first_gfn = end_gfn + 1;
+ }
+ else if ( (last_gfn & mask) != mask )
+ {
+ unsigned long start_gfn = max(first_gfn, last_gfn & ~mask);
+
+ err = p2m_pt_set_recalc_range(p2m, i, start_gfn, last_gfn);
+ if ( err || start_gfn <= first_gfn )
+ break;
+ last_gfn = start_gfn - 1;
+ }
+ else
+ {
+ ++i;
+ mask |= mask << PAGETABLE_ORDER;
+ }
+ }
+
+ return err;
+}
+
#if P2M_AUDIT
long p2m_pt_audit_p2m(struct p2m_domain *p2m)
{
@@ -872,6 +1140,7 @@ void p2m_pt_init(struct p2m_domain *p2m)
p2m->set_entry = p2m_pt_set_entry;
p2m->get_entry = p2m_pt_get_entry;
p2m->change_entry_type_global = p2m_pt_change_entry_type_global;
+ p2m->change_entry_type_range = p2m_pt_change_entry_type_range;
p2m->write_p2m_entry = paging_write_p2m_entry;
#if P2M_AUDIT
p2m->audit_p2m = p2m_pt_audit_p2m;
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -668,6 +668,8 @@ static inline p2m_type_t p2m_flags_to_ty
return (flags >> 12) & 0x7f;
}
+int p2m_npt_fault(uint64_t gpa);
+
/*
* Nested p2m: shadow p2m tables used for nested HVM virtualization
*/
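
Also for illustration (not part of the patch): a standalone sketch of the range decomposition performed by p2m_pt_change_entry_type_range() in the p2m-pt.c hunk above. Unaligned head and tail pieces of [first_gfn, last_gfn] are handled at the current page-table level; once both ends are aligned, the mask widens by PAGETABLE_ORDER bits and the remaining middle moves up one level. The example input and the printing are invented for demonstration.

/*
 * Standalone sketch of the per-level range splitting; not Xen code.
 */
#include <stdio.h>

#define PAGETABLE_ORDER 9

static void split_range(unsigned long first_gfn, unsigned long last_gfn)
{
    unsigned long mask = (1UL << PAGETABLE_ORDER) - 1;
    unsigned int level;

    for (level = 1; level <= 4; ) {
        if (first_gfn & mask) {
            /* Unaligned head: handle it at this level. */
            unsigned long end_gfn = first_gfn | mask;

            if (end_gfn > last_gfn)
                end_gfn = last_gfn;
            printf("L%u: [%#lx, %#lx]\n", level, first_gfn, end_gfn);
            if (end_gfn >= last_gfn)
                return;
            first_gfn = end_gfn + 1;
        } else if ((last_gfn & mask) != mask) {
            /* Unaligned tail: handle it at this level. */
            unsigned long start_gfn = last_gfn & ~mask;

            if (start_gfn < first_gfn)
                start_gfn = first_gfn;
            printf("L%u: [%#lx, %#lx]\n", level, start_gfn, last_gfn);
            if (start_gfn <= first_gfn)
                return;
            last_gfn = start_gfn - 1;
        } else {
            /* Both ends aligned: widen the mask and go one level up. */
            ++level;
            mask |= mask << PAGETABLE_ORDER;
        }
    }
}

int main(void)
{
    split_range(0x1234, 0x456789);           /* prints one range per level handled */
    return 0;
}
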
Attachment: NPT-implement-cetr.patch

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel