x86/EPT: force re-evaluation of memory type as necessary The main goal here is to drop the bogus dependency of epte_get_entry_emt() on d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT]. Any change to state influencing epte_get_entry_emt()'s decision needs to result in re-calculation. Do this by using the EPT_MISCONFIG VM exit, storing an invalid memory type into EPT's emt field (leaving the IOMMU, which doesn't care about memory types, unaffected). This is being done in a hierarchical manner to keep execution time down: Initially only the top level directory gets invalidated this way. Upon access, the involved intermediate page table levels get cleared back to zero, and the leaf entry gets its field properly set. For 4k leaves all other entries in the same directory also get processed to amortize the cost of the extra VM exit (which halved the number of these VM exits in my testing). This restoring can result in spurious EPT_MISCONFIG VM exits (since two vCPU-s may access addresses involving identical page table structures). Rather than simply returning in such cases (and risking that such a VM exit results from a real mis-configuration, which would then result in an endless loop rather than killing the VM), a per-vCPU flag is being introduced indicating when such a spurious VM exit might validly happen - if another one occurs right after VM re-entry, the flag would generally end up being clear, causing the VM to be killed as before on such VM exits. Note that putting a reserved memory type value in the EPT structures isn't formally sanctioned by the specification. Intel isn't willing to adjust the specification to make this or a similar use of the EPT_MISCONFIG VM exit formally possible, but they have indicated that us using this is low risk wrt forward compatibility. 
Signed-off-by: Jan Beulich --- a/xen/arch/x86/domctl.c +++ b/xen/arch/x86/domctl.c @@ -84,6 +84,8 @@ long arch_do_domctl( ret = ioports_permit_access(d, fp, fp + np - 1); else ret = ioports_deny_access(d, fp, fp + np - 1); + if ( !ret ) + memory_type_changed(d); } break; @@ -705,6 +707,8 @@ long arch_do_domctl( ret, add ? "removing" : "denying", d->domain_id, mfn, mfn + nr_mfns - 1); } + /* Do this unconditionally to cover errors on above failure paths. */ + memory_type_changed(d); } break; @@ -791,6 +795,8 @@ long arch_do_domctl( "ioport_map: error %ld denying dom%d access to [%x,%x]\n", ret, d->domain_id, fmp, fmp + np - 1); } + if ( !ret ) + memory_type_changed(d); } break; --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -252,6 +252,9 @@ int hvm_set_guest_pat(struct vcpu *v, u6 if ( !hvm_funcs.set_guest_pat(v, guest_pat) ) v->arch.hvm_vcpu.pat_cr = guest_pat; + + memory_type_changed(v->domain); + return 1; } --- a/xen/arch/x86/hvm/mtrr.c +++ b/xen/arch/x86/hvm/mtrr.c @@ -431,8 +431,12 @@ bool_t mtrr_def_type_msr_set(struct doma return 0; } - m->enabled = enabled; - m->def_type = def_type; + if ( m->enabled != enabled || m->def_type != def_type ) + { + m->enabled = enabled; + m->def_type = def_type; + memory_type_changed(d); + } return 1; } @@ -452,6 +456,7 @@ bool_t mtrr_fix_range_msr_set(struct dom return 0; fixed_range_base[row] = msr_content; + memory_type_changed(d); } return 1; @@ -496,6 +501,8 @@ bool_t mtrr_var_range_msr_set( m->overlapped = is_var_mtrr_overlapped(m); + memory_type_changed(d); + return 1; } @@ -690,6 +697,12 @@ static int hvm_load_mtrr_msr(struct doma HVM_REGISTER_SAVE_RESTORE(MTRR, hvm_save_mtrr_msr, hvm_load_mtrr_msr, 1, HVMSR_PER_VCPU); +void memory_type_changed(struct domain *d) +{ + if ( iommu_enabled && !iommu_snoop && d->vcpu && d->vcpu[0] ) + p2m_memory_type_changed(d); +} + uint8_t epte_get_entry_emt(struct domain *d, unsigned long gfn, mfn_t mfn, uint8_t *ipat, bool_t direct_mmio) { @@ -733,8 +746,7 @@ uint8_t 
epte_get_entry_emt(struct domain return MTRR_TYPE_WRBACK; } - gmtrr_mtype = is_hvm_domain(d) && v && - d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] ? + gmtrr_mtype = is_hvm_domain(d) && v ? get_mtrr_type(&v->arch.hvm_vcpu.mtrr, (gfn << PAGE_SHIFT)) : MTRR_TYPE_WRBACK; --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -3016,6 +3016,16 @@ void vmx_vmexit_handler(struct cpu_user_ break; } + case EXIT_REASON_EPT_MISCONFIG: + { + paddr_t gpa; + + __vmread(GUEST_PHYSICAL_ADDRESS, &gpa); + if ( !ept_handle_misconfig(gpa) ) + goto exit_and_crash; + break; + } + case EXIT_REASON_MONITOR_TRAP_FLAG: v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG; vmx_update_cpu_exec_control(v); --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -200,6 +200,18 @@ void p2m_change_entry_type_global(struct p2m_unlock(p2m); } +void p2m_memory_type_changed(struct domain *d) +{ + struct p2m_domain *p2m = p2m_get_hostp2m(d); + + if ( p2m->memory_type_changed ) + { + p2m_lock(p2m); + p2m->memory_type_changed(p2m); + p2m_unlock(p2m); + } +} + mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a, p2m_query_t q, unsigned int *page_order, bool_t locked) --- a/xen/arch/x86/mm/p2m-ept.c +++ b/xen/arch/x86/mm/p2m-ept.c @@ -660,6 +660,135 @@ static void ept_change_entry_type_global ept_sync_domain(p2m); } +static bool_t ept_invalidate_emt(mfn_t mfn) +{ + ept_entry_t *epte = map_domain_page(mfn_x(mfn)); + unsigned int i; + bool_t changed = 0; + + for ( i = 0; i < EPT_PAGETABLE_ENTRIES; i++ ) + { + ept_entry_t e = atomic_read_ept_entry(&epte[i]); + + if ( !is_epte_valid(&e) || !is_epte_present(&e) || + e.emt == MTRR_NUM_TYPES ) + continue; + + e.emt = MTRR_NUM_TYPES; + atomic_write_ept_entry(&epte[i], e); + changed = 1; + } + + unmap_domain_page(epte); + + return changed; +} + +static void ept_memory_type_changed(struct p2m_domain *p2m) +{ + unsigned long mfn = ept_get_asr(&p2m->ept); + + if ( !mfn ) + return; + + if ( 
ept_invalidate_emt(_mfn(mfn)) ) + ept_sync_domain(p2m); +} + +bool_t ept_handle_misconfig(uint64_t gpa) +{ + struct vcpu *curr = current; + struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain); + struct ept_data *ept = &p2m->ept; + unsigned int level = ept_get_wl(ept); + unsigned long gfn = PFN_DOWN(gpa); + unsigned long mfn = ept_get_asr(ept); + ept_entry_t *epte; + int okay; + + if ( !mfn ) + return 0; + + p2m_lock(p2m); + + okay = -curr->arch.hvm_vmx.ept_spurious_misconfig; + for ( ; ; --level) + { + ept_entry_t e; + unsigned int i; + + epte = map_domain_page(mfn); + i = (gfn >> (level * EPT_TABLE_ORDER)) & (EPT_PAGETABLE_ENTRIES - 1); + e = atomic_read_ept_entry(&epte[i]); + + if ( level == 0 || is_epte_superpage(&e) ) + { + uint8_t ipat = 0; + + if ( e.emt != MTRR_NUM_TYPES ) + break; + + if ( level == 0 ) + { + for ( gfn -= i, i = 0; i < EPT_PAGETABLE_ENTRIES; ++i ) + { + e = atomic_read_ept_entry(&epte[i]); + if ( e.emt == MTRR_NUM_TYPES ) + e.emt = 0; + if ( !is_epte_valid(&e) || !is_epte_present(&e) ) + continue; + e.emt = epte_get_entry_emt(p2m->domain, gfn + i, + _mfn(e.mfn), &ipat, + e.sa_p2mt == p2m_mmio_direct); + e.ipat = ipat; + atomic_write_ept_entry(&epte[i], e); + } + } + else + { + e.emt = epte_get_entry_emt(p2m->domain, gfn, _mfn(e.mfn), + &ipat, + e.sa_p2mt == p2m_mmio_direct); + e.ipat = ipat; + atomic_write_ept_entry(&epte[i], e); + } + + okay = 1; + break; + } + + if ( e.emt == MTRR_NUM_TYPES ) + { + ASSERT(is_epte_present(&e)); + ept_invalidate_emt(_mfn(e.mfn)); + e.emt = 0; + atomic_write_ept_entry(&epte[i], e); + unmap_domain_page(epte); + okay = 1; + } + else if ( is_epte_present(&e) && !e.emt ) + unmap_domain_page(epte); + else + break; + + mfn = e.mfn; + } + + unmap_domain_page(epte); + if ( okay > 0 ) + { + struct vcpu *v; + + for_each_vcpu ( curr->domain, v ) + v->arch.hvm_vmx.ept_spurious_misconfig = 1; + } + curr->arch.hvm_vmx.ept_spurious_misconfig = 0; + ept_sync_domain(p2m); + p2m_unlock(p2m); + + return !!okay; +} + static 
void __ept_sync_domain(void *info) { struct ept_data *ept = &((struct p2m_domain *)info)->ept; @@ -697,6 +826,7 @@ int ept_p2m_init(struct p2m_domain *p2m) p2m->set_entry = ept_set_entry; p2m->get_entry = ept_get_entry; p2m->change_entry_type_global = ept_change_entry_type_global; + p2m->memory_type_changed = ept_memory_type_changed; p2m->audit_p2m = NULL; /* Set the memory type used when accessing EPT paging structures. */ @@ -737,6 +867,7 @@ static void ept_dump_p2m_table(unsigned [MTRR_TYPE_WRTHROUGH] = "WT", [MTRR_TYPE_WRPROT] = "WP", [MTRR_TYPE_WRBACK] = "WB", + [MTRR_NUM_TYPES] = "??" }; for_each_domain(d) @@ -750,11 +881,16 @@ static void ept_dump_p2m_table(unsigned for ( gfn = 0; gfn <= p2m->max_mapped_pfn; gfn += 1UL << order ) { + char c = 0; + gfn_remainder = gfn; table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m))); for ( i = ept_get_wl(ept); i > 0; i-- ) { + ept_entry = table + (gfn_remainder >> (i * EPT_TABLE_ORDER)); + if ( ept_entry->emt == MTRR_NUM_TYPES ) + c = '?'; ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i); if ( ret != GUEST_TABLE_NORMAL_PAGE ) break; @@ -775,7 +911,7 @@ static void ept_dump_p2m_table(unsigned memory_types[ept_entry->emt][0], memory_types[ept_entry->emt][1] ?: ept_entry->emt + '0', - ept_entry->ipat ? '!' : ' '); + c ?: ept_entry->ipat ? '!' : ' '); if ( !(record_counter++ % 100) ) process_pending_softirqs(); --- a/xen/common/domctl.c +++ b/xen/common/domctl.c @@ -815,6 +815,10 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1); else ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1); +#ifdef CONFIG_X86 + if ( !ret ) + memory_type_changed(d); +#endif } break; --- a/xen/include/asm-x86/hvm/vmx/vmcs.h +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h @@ -124,6 +124,9 @@ struct arch_vmx_struct { unsigned long host_cr0; + /* Do we need to tolerate a spurious EPT_MISCONFIG VM exit? */ + bool_t ept_spurious_misconfig; + /* Is the guest in real mode? 
*/ uint8_t vmx_realmode; /* Are we emulating rather than VMENTERing? */ --- a/xen/include/asm-x86/hvm/vmx/vmx.h +++ b/xen/include/asm-x86/hvm/vmx/vmx.h @@ -520,6 +520,7 @@ int ept_p2m_init(struct p2m_domain *p2m) void ept_p2m_uninit(struct p2m_domain *p2m); void ept_walk_table(struct domain *d, unsigned long gfn); +bool_t ept_handle_misconfig(uint64_t gpa); void setup_ept_dump(void); void update_guest_eip(void); --- a/xen/include/asm-x86/mtrr.h +++ b/xen/include/asm-x86/mtrr.h @@ -88,6 +88,7 @@ extern bool_t mtrr_fix_range_msr_set(str uint32_t row, uint64_t msr_content); extern bool_t mtrr_def_type_msr_set(struct domain *, struct mtrr_state *, uint64_t msr_content); +extern void memory_type_changed(struct domain *); extern bool_t pat_msr_set(uint64_t *pat, uint64_t msr); bool_t is_var_mtrr_overlapped(struct mtrr_state *m); --- a/xen/include/asm-x86/p2m.h +++ b/xen/include/asm-x86/p2m.h @@ -233,6 +233,7 @@ struct p2m_domain { void (*change_entry_type_global)(struct p2m_domain *p2m, p2m_type_t ot, p2m_type_t nt); + void (*memory_type_changed)(struct p2m_domain *p2m); void (*write_p2m_entry)(struct p2m_domain *p2m, unsigned long gfn, l1_pgentry_t *p, @@ -506,6 +507,9 @@ void p2m_change_type_range(struct domain p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn, p2m_type_t ot, p2m_type_t nt); +/* Report a change affecting memory types. */ +void p2m_memory_type_changed(struct domain *d); + /* Set mmio addresses in the p2m table (for pass-through) */ int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn); int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn);