Save and restore each vCPU's virtual MCG_CAP, so that guests can be migrated
between hosts offering different MCA capabilities: HVM guests get a new
per-vCPU save record (VMCE_VCPU), while for PV guests the value travels in the
extended vCPU context. This requires setting up g_mcg_cap using a white-listing
bit mask, so that only capabilities vMCE actually emulates get exposed to
guests. Open question: should we also save/restore MCG_CTL and the per-bank
MCi_CTL values? (A standalone sketch of the mask and restore-time validation
logic follows the patch.)

--- a/tools/libxc/xc_domain_save.c
+++ b/tools/libxc/xc_domain_save.c
@@ -1879,6 +1879,7 @@ int xc_domain_save(xc_interface *xch, in
 
         domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
         domctl.domain = dom;
+        memset(&domctl.u, 0, sizeof(domctl.u));
         domctl.u.ext_vcpucontext.vcpu = i;
         if ( xc_domctl(xch, &domctl) < 0 )
         {
--- a/tools/misc/xen-hvmctx.c
+++ b/tools/misc/xen-hvmctx.c
@@ -383,6 +383,13 @@ static void dump_viridian_vcpu(void)
            (unsigned long long) p.apic_assist);
 }
 
+static void dump_vmce_vcpu(void)
+{
+    HVM_SAVE_TYPE(VMCE_VCPU) p;
+    READ(p);
+    printf("    VMCE_VCPU: caps %" PRIx64 "\n", p.caps);
+}
+
 int main(int argc, char **argv)
 {
     int entry, domid;
@@ -449,6 +456,7 @@ int main(int argc, char **argv)
         case HVM_SAVE_CODE(MTRR): dump_mtrr(); break;
         case HVM_SAVE_CODE(VIRIDIAN_DOMAIN): dump_viridian_domain(); break;
         case HVM_SAVE_CODE(VIRIDIAN_VCPU): dump_viridian_vcpu(); break;
+        case HVM_SAVE_CODE(VMCE_VCPU): dump_vmce_vcpu(); break;
         case HVM_SAVE_CODE(END): break;
         default:
             printf(" ** Don't understand type %u: skipping\n",
--- a/xen/arch/x86/cpu/mcheck/mce.h
+++ b/xen/arch/x86/cpu/mcheck/mce.h
@@ -3,6 +3,7 @@
 #define _MCE_H
 
 #include <xen/init.h>
+#include <xen/sched.h>
 #include <xen/smp.h>
 #include <asm/types.h>
 #include <asm/traps.h>
@@ -54,8 +55,8 @@ int unmmap_broken_page(struct domain *d,
 u64 mce_cap_init(void);
 extern unsigned int firstbank;
 
-int intel_mce_rdmsr(uint32_t msr, uint64_t *val);
-int intel_mce_wrmsr(uint32_t msr, uint64_t val);
+int intel_mce_rdmsr(const struct vcpu *, uint32_t msr, uint64_t *val);
+int intel_mce_wrmsr(struct vcpu *, uint32_t msr, uint64_t val);
 
 struct mcinfo_extended *intel_get_extended_msrs(
     struct mcinfo_global *mig, struct mc_info *mi);
@@ -171,18 +172,20 @@ int vmce_domain_inject(struct mcinfo_ban
 
 extern int vmce_init(struct cpuinfo_x86 *c);
 
-static inline int mce_vendor_bank_msr(uint32_t msr)
+static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr)
 {
     if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
-         msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks) )
+         msr >= MSR_IA32_MC0_CTL2 &&
+         msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) )
         return 1;
     return 0;
 }
 
-static inline int mce_bank_msr(uint32_t msr)
+static inline int mce_bank_msr(const struct vcpu *v, uint32_t msr)
 {
-    if ( (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(nr_mce_banks)) ||
-         mce_vendor_bank_msr(msr) )
+    if ( (msr >= MSR_IA32_MC0_CTL &&
+          msr < MSR_IA32_MCx_CTL(v->arch.mcg_cap & MCG_CAP_COUNT)) ||
+         mce_vendor_bank_msr(v, msr) )
         return 1;
     return 0;
 }
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
@@ -1421,11 +1421,12 @@ enum mcheck_type intel_mcheck_init(struc
 }
 
 /* intel specific MCA MSR */
-int intel_mce_wrmsr(uint32_t msr, uint64_t val)
+int intel_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
 {
     int ret = 0;
 
-    if (msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks))
+    if ( msr >= MSR_IA32_MC0_CTL2 &&
+         msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) )
     {
         mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
                    "Guest should not write this MSR!\n");
@@ -1435,11 +1436,12 @@ int intel_mce_wrmsr(uint32_t msr, uint64
     return ret;
 }
 
-int intel_mce_rdmsr(uint32_t msr, uint64_t *val)
+int intel_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
 {
     int ret = 0;
 
-    if (msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks))
+    if ( msr >= MSR_IA32_MC0_CTL2 &&
+         msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) )
     {
         mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
                    "Guest should not read this MSR!\n");
--- a/xen/arch/x86/cpu/mcheck/vmce.c
+++ b/xen/arch/x86/cpu/mcheck/vmce.c
@@ -10,6 +10,7 @@
 #include <xen/delay.h>
 #include <xen/smp.h>
 #include <xen/mm.h>
+#include <asm/hvm/save.h>
 #include <asm/processor.h>
 #include <public/sysctl.h>
 #include <asm/system.h>
@@ -42,7 +43,6 @@
            nr_mce_banks * sizeof(*dom_vmce(d)->mci_ctl));
 
     dom_vmce(d)->mcg_status = 0x0;
-    dom_vmce(d)->mcg_cap = g_mcg_cap;
    dom_vmce(d)->mcg_ctl = ~(uint64_t)0x0;
    dom_vmce(d)->nr_injection = 0;
 
@@ -61,21 +61,41 @@ void vmce_destroy_msr(struct domain *d)
     dom_vmce(d) = NULL;
 }
 
-static int bank_mce_rdmsr(struct domain *d, uint32_t msr, uint64_t *val)
+void vmce_init_vcpu(struct vcpu *v)
 {
-    int bank, ret = 1;
-    struct domain_mca_msrs *vmce = dom_vmce(d);
+    v->arch.mcg_cap = g_mcg_cap;
+}
+
+int vmce_restore_vcpu(struct vcpu *v, uint64_t caps)
+{
+    if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P )
+    {
+        dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities"
+                " %#" PRIx64 " for d%d:v%u (supported: %#Lx)\n",
+                is_hvm_vcpu(v) ? "HVM" : "PV", caps, v->domain->domain_id,
+                v->vcpu_id, g_mcg_cap & ~MCG_CAP_COUNT);
+        return -EPERM;
+    }
+
+    v->arch.mcg_cap = caps;
+    return 0;
+}
+
+static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
+{
+    int ret = 1;
+    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;
+    struct domain_mca_msrs *vmce = dom_vmce(v->domain);
     struct bank_entry *entry;
 
-    bank = (msr - MSR_IA32_MC0_CTL) / 4;
-    if ( bank >= nr_mce_banks )
-        return -1;
+    *val = 0;
 
     switch ( msr & (MSR_IA32_MC0_CTL | 3) )
     {
     case MSR_IA32_MC0_CTL:
-        *val = vmce->mci_ctl[bank] &
-               (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
+        if ( bank < nr_mce_banks )
+            *val = vmce->mci_ctl[bank] &
+                   (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
         mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n",
                    bank, *val);
         break;
@@ -126,7 +146,7 @@ static int bank_mce_rdmsr(struct domain
         switch ( boot_cpu_data.x86_vendor )
         {
         case X86_VENDOR_INTEL:
-            ret = intel_mce_rdmsr(msr, val);
+            ret = intel_mce_rdmsr(v, msr, val);
             break;
         default:
             ret = 0;
@@ -145,13 +165,13 @@ static int bank_mce_rdmsr(struct domain
  */
 int vmce_rdmsr(uint32_t msr, uint64_t *val)
 {
-    struct domain *d = current->domain;
-    struct domain_mca_msrs *vmce = dom_vmce(d);
+    const struct vcpu *cur = current;
+    struct domain_mca_msrs *vmce = dom_vmce(cur->domain);
     int ret = 1;
 
     *val = 0;
 
-    spin_lock(&dom_vmce(d)->lock);
+    spin_lock(&vmce->lock);
 
     switch ( msr )
     {
@@ -162,39 +182,38 @@ int vmce_rdmsr(uint32_t msr, uint64_t *v
                    "MCE: rdmsr MCG_STATUS 0x%"PRIx64"\n", *val);
         break;
     case MSR_IA32_MCG_CAP:
-        *val = vmce->mcg_cap;
+        *val = cur->arch.mcg_cap;
         mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n",
                    *val);
         break;
     case MSR_IA32_MCG_CTL:
         /* Always 0 if no CTL support */
-        *val = vmce->mcg_ctl & h_mcg_ctl;
+        if ( cur->arch.mcg_cap & MCG_CTL_P )
+            *val = vmce->mcg_ctl & h_mcg_ctl;
         mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n",
                    *val);
         break;
     default:
-        ret = mce_bank_msr(msr) ? bank_mce_rdmsr(d, msr, val) : 0;
+        ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0;
         break;
     }
 
-    spin_unlock(&dom_vmce(d)->lock);
+    spin_unlock(&vmce->lock);
 
     return ret;
 }
 
-static int bank_mce_wrmsr(struct domain *d, u32 msr, u64 val)
+static int bank_mce_wrmsr(struct vcpu *v, u32 msr, u64 val)
 {
-    int bank, ret = 1;
-    struct domain_mca_msrs *vmce = dom_vmce(d);
+    int ret = 1;
+    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;
+    struct domain_mca_msrs *vmce = dom_vmce(v->domain);
     struct bank_entry *entry = NULL;
 
-    bank = (msr - MSR_IA32_MC0_CTL) / 4;
-    if ( bank >= nr_mce_banks )
-        return -EINVAL;
-
     switch ( msr & (MSR_IA32_MC0_CTL | 3) )
     {
     case MSR_IA32_MC0_CTL:
-        vmce->mci_ctl[bank] = val;
+        if ( bank < nr_mce_banks )
+            vmce->mci_ctl[bank] = val;
         break;
     case MSR_IA32_MC0_STATUS:
         /* Give the first entry of the list, it corresponds to current
@@ -202,9 +221,9 @@ static int bank_mce_wrmsr(struct domain
          * the guest, this node will be deleted.
         * Only error bank is written. Non-error banks simply return.
         */
-        if ( !list_empty(&dom_vmce(d)->impact_header) )
+        if ( !list_empty(&vmce->impact_header) )
         {
-            entry = list_entry(dom_vmce(d)->impact_header.next,
+            entry = list_entry(vmce->impact_header.next,
                                struct bank_entry, list);
             if ( entry->bank == bank )
                 entry->mci_status = val;
@@ -228,7 +247,7 @@ static int bank_mce_wrmsr(struct domain
         switch ( boot_cpu_data.x86_vendor )
         {
         case X86_VENDOR_INTEL:
-            ret = intel_mce_wrmsr(msr, val);
+            ret = intel_mce_wrmsr(v, msr, val);
             break;
         default:
             ret = 0;
@@ -247,9 +266,9 @@ static int bank_mce_wrmsr(struct domain
  */
 int vmce_wrmsr(u32 msr, u64 val)
 {
-    struct domain *d = current->domain;
+    struct vcpu *cur = current;
     struct bank_entry *entry = NULL;
-    struct domain_mca_msrs *vmce = dom_vmce(d);
+    struct domain_mca_msrs *vmce = dom_vmce(cur->domain);
     int ret = 1;
 
     if ( !g_mcg_cap )
@@ -266,7 +285,7 @@ int vmce_wrmsr(u32 msr, u64 val)
         vmce->mcg_status = val;
         mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val);
         /* For HVM guest, this is the point for deleting vMCE injection node */
-        if ( d->is_hvm && (vmce->nr_injection > 0) )
+        if ( is_hvm_vcpu(cur) && (vmce->nr_injection > 0) )
         {
             vmce->nr_injection--; /* Should be 0 */
             if ( !list_empty(&vmce->impact_header) )
@@ -293,7 +312,7 @@ int vmce_wrmsr(u32 msr, u64 val)
         ret = -1;
         break;
     default:
-        ret = mce_bank_msr(msr) ? bank_mce_wrmsr(d, msr, val) : 0;
+        ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0;
         break;
     }
 
@@ -301,6 +320,46 @@ int vmce_wrmsr(u32 msr, u64 val)
     return ret;
 }
 
+static int vmce_save_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
+{
+    struct vcpu *v;
+    int err = 0;
+
+    for_each_vcpu( d, v ) {
+        struct hvm_vmce_vcpu ctxt = {
+            .caps = v->arch.mcg_cap
+        };
+
+        err = hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt);
+        if ( err )
+            break;
+    }
+
+    return err;
+}
+
+static int vmce_load_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
+{
+    unsigned int vcpuid = hvm_load_instance(h);
+    struct vcpu *v;
+    struct hvm_vmce_vcpu ctxt;
+    int err;
+
+    if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
+    {
+        dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n",
+                d->domain_id, vcpuid);
+        err = -EINVAL;
+    }
+    else
+        err = hvm_load_entry(VMCE_VCPU, h, &ctxt);
+
+    return err ?: vmce_restore_vcpu(v, ctxt.caps);
+}
+
+HVM_REGISTER_SAVE_RESTORE(VMCE_VCPU, vmce_save_vcpu_ctxt,
+                          vmce_load_vcpu_ctxt, 1, HVMSR_PER_VCPU);
+
 int inject_vmce(struct domain *d)
 {
     int cpu = smp_processor_id();
@@ -457,7 +516,7 @@ int vmce_init(struct cpuinfo_x86 *c)
 
     rdmsrl(MSR_IA32_MCG_CAP, value);
     /* For Guest vMCE usage */
-    g_mcg_cap = value & ~MCG_CMCI_P;
+    g_mcg_cap = value & (MCG_CAP_COUNT | MCG_CTL_P | MCG_TES_P | MCG_SER_P);
     if (value & MCG_CTL_P)
         rdmsrl(MSR_IA32_MCG_CTL, h_mcg_ctl);
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h
@@ -30,12 +30,13 @@
 
 
 /* Bitfield of the MSR_IA32_MCG_CAP register */
-#define MCG_SER_P               (1UL<<24)
 #define MCG_CAP_COUNT           0x00000000000000ffULL
-#define MCG_CTL_P               0x0000000000000100ULL
-#define MCG_EXT_P               (1UL<<9)
-#define MCG_EXT_CNT             (16)
-#define MCG_CMCI_P              (1UL<<10)
+#define MCG_CTL_P               (1ULL<<8)
+#define MCG_EXT_P               (1ULL<<9)
+#define MCG_CMCI_P              (1ULL<<10)
+#define MCG_TES_P               (1ULL<<11)
+#define MCG_EXT_CNT             16
+#define MCG_SER_P               (1ULL<<24)
 /* Other bits are reserved */
 
 /* Bitfield of the MSR_IA32_MCG_STATUS register */
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -422,6 +422,8 @@ int vcpu_initialise(struct vcpu *v)
     if ( (rc = vcpu_init_fpu(v)) != 0 )
         return rc;
 
+    vmce_init_vcpu(v);
+
     if ( is_hvm_domain(d) )
     {
         rc = hvm_vcpu_initialise(v);
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -1027,11 +1027,12 @@ long arch_do_domctl(
             evc->syscall32_callback_eip = 0;
             evc->syscall32_disables_events = 0;
         }
+        evc->mcg_cap = v->arch.mcg_cap;
     }
     else
     {
         ret = -EINVAL;
-        if ( evc->size != sizeof(*evc) )
+        if ( evc->size < offsetof(typeof(*evc), mcg_cap) )
             goto ext_vcpucontext_out;
 #ifdef __x86_64__
         if ( !is_hvm_domain(d) )
@@ -1059,6 +1060,10 @@ long arch_do_domctl(
                  (evc->syscall32_callback_cs & ~3) ||
                  evc->syscall32_callback_eip )
                 goto ext_vcpucontext_out;
+
+            if ( evc->size >= offsetof(typeof(*evc), mcg_cap) +
+                              sizeof(evc->mcg_cap) )
+                ret = vmce_restore_vcpu(v, evc->mcg_cap);
         }
 
         ret = 0;
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -488,6 +488,8 @@ struct arch_vcpu
     /* This variable determines whether nonlazy extended state has been used,
      * and thus should be saved/restored.
      */
    bool_t nonlazy_xstate_used;
+
+    uint64_t mcg_cap;
 
     struct paging_vcpu paging;
--- a/xen/include/asm-x86/mce.h
+++ b/xen/include/asm-x86/mce.h
@@ -16,7 +16,6 @@ struct bank_entry {
 struct domain_mca_msrs
 {
     /* Guest should not change below values after DOM boot up */
-    uint64_t mcg_cap;
     uint64_t mcg_ctl;
     uint64_t mcg_status;
     uint64_t *mci_ctl;
@@ -28,6 +27,8 @@ struct domain_mca_msrs
 
 /* Guest vMCE MSRs virtualization */
 extern int vmce_init_msr(struct domain *d);
 extern void vmce_destroy_msr(struct domain *d);
+extern void vmce_init_vcpu(struct vcpu *);
+extern int vmce_restore_vcpu(struct vcpu *, uint64_t caps);
 extern int vmce_wrmsr(uint32_t msr, uint64_t val);
 extern int vmce_rdmsr(uint32_t msr, uint64_t *val);
--- a/xen/include/public/arch-x86/hvm/save.h
+++ b/xen/include/public/arch-x86/hvm/save.h
@@ -575,9 +575,15 @@ struct hvm_viridian_vcpu_context {
 
 DECLARE_HVM_SAVE_TYPE(VIRIDIAN_VCPU, 17, struct hvm_viridian_vcpu_context);
 
+struct hvm_vmce_vcpu {
+    uint64_t caps;
+};
+
+DECLARE_HVM_SAVE_TYPE(VMCE_VCPU, 18, struct hvm_vmce_vcpu);
+
 /*
  * Largest type-code in use
  */
-#define HVM_SAVE_CODE_MAX 17
+#define HVM_SAVE_CODE_MAX 18
 
 #endif /* __XEN_PUBLIC_HVM_SAVE_X86_H__ */
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -559,7 +559,7 @@ struct xen_domctl_ext_vcpucontext {
     uint32_t         vcpu;
     /*
      * SET: Size of struct (IN)
-     * GET: Size of struct (OUT)
+     * GET: Size of struct (OUT, up to 128 bytes)
     */
     uint32_t         size;
 #if defined(__i386__) || defined(__x86_64__)
@@ -571,6 +571,7 @@ struct xen_domctl_ext_vcpucontext {
     uint16_t         sysenter_callback_cs;
     uint8_t          syscall32_disables_events;
     uint8_t          sysenter_disables_events;
+    uint64_aligned_t mcg_cap;
 #endif
 };
 typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t;
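
For illustration only -- this is not part of the patch -- here is a minimal,
self-contained mock-up of the white-listing and restore-time validation above.
The helper names (guest_mcg_cap(), restore_ok()) and the sample host
capability values are invented for the example; only the bit masks and the
acceptance test mirror what vmce_init() and vmce_restore_vcpu() do:

#include <inttypes.h>
#include <stdio.h>

/* MCG_CAP bit layout, as in xen/arch/x86/cpu/mcheck/x86_mca.h. */
#define MCG_CAP_COUNT 0x00000000000000ffULL  /* number of MCA banks */
#define MCG_CTL_P     (1ULL << 8)
#define MCG_CMCI_P    (1ULL << 10)
#define MCG_TES_P     (1ULL << 11)
#define MCG_SER_P     (1ULL << 24)

/* The white-list: only capabilities vMCE actually emulates are exposed to
 * guests; anything else the host reports (e.g. MCG_CMCI_P) stays hidden. */
#define VMCE_CAP_MASK (MCG_CAP_COUNT | MCG_CTL_P | MCG_TES_P | MCG_SER_P)

/* Mirrors vmce_init(): derive the guest-visible MCG_CAP from the host's. */
static uint64_t guest_mcg_cap(uint64_t host_mcg_cap)
{
    return host_mcg_cap & VMCE_CAP_MASK;
}

/* Mirrors the check in vmce_restore_vcpu(): accept an incoming record only
 * if it claims no capability the destination host lacks; the bank count
 * (MCG_CAP_COUNT) and MCG_CTL_P are deliberately exempt from the check. */
static int restore_ok(uint64_t dst_g_mcg_cap, uint64_t saved_caps)
{
    return !(saved_caps & ~dst_g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P);
}

int main(void)
{
    /* Hypothetical source host: 6 banks, CTL/TES/SER/CMCI all present. */
    uint64_t saved = guest_mcg_cap(6 | MCG_CTL_P | MCG_TES_P | MCG_SER_P |
                                   MCG_CMCI_P);
    /* Hypothetical destination host: 4 banks, no software error recovery. */
    uint64_t dst = guest_mcg_cap(4 | MCG_CTL_P | MCG_TES_P);

    /* CMCI was masked out of the save record by the white-list. */
    printf("caps in save record:    %#" PRIx64 "\n", saved);
    /* Rejected: the record carries MCG_SER_P, which the destination lacks;
     * the smaller bank count alone would have been tolerated. */
    printf("restore on destination: %s\n",
           restore_ok(dst, saved) ? "accepted" : "rejected (-EPERM)");
    return 0;
}

Note the compatibility handling on the PV side as well: arch_do_domctl() now
accepts any evc->size down to offsetof(typeof(*evc), mcg_cap) instead of
insisting on an exact sizeof(*evc) match, so ext_vcpucontext records produced
by hypervisors predating this change (which lack the mcg_cap field) continue
to restore, and only records actually carrying the field are passed on to
vmce_restore_vcpu().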