Re: [Xen-devel] vMCE vs migration
Jan Beulich wrote:
>>>> On 26.01.12 at 17:54, George Dunlap <george.dunlap@xxxxxxxxxx> wrote:
>> On Tue, 2012-01-24 at 11:08 +0000, Jan Beulich wrote:
>>>>>> On 24.01.12 at 11:29, George Dunlap <George.Dunlap@xxxxxxxxxxxxx> wrote:
>>>> On Mon, Jan 23, 2012 at 11:08 AM, Jan Beulich <JBeulich@xxxxxxxx> wrote:
>>>>> x86's vMCE implementation lets a guest know of as many MCE
>>>>> reporting banks as there are in the host. While a PV guest could
>>>>> be expected to deal with this number changing (particularly
>>>>> decreasing) during migration (not currently handled anywhere
>>>>> afaict), for HVM guests this is certainly wrong.
>>>>>
>>>>> At least to me it isn't, however, clear how to properly handle
>>>>> this. The easiest would appear to be to save and restore the
>>>>> number of banks the guest was made to believe it can access,
>>>>> making vmce_{rd,wr}msr() silently tolerate accesses between the
>>>>> host and guest values.
>>>>
>>>> We ran into this in the XS 6.0 release as well. I think that the
>>>> ideal thing to do would be to have a parameter that can be set at
>>>> boot, to say how many vMCE banks a guest has, defaulting to the
>>>> number of MCE banks on the host. This parameter would be
>>>> preserved across migration. Ideally, a pool-aware toolstack like
>>>> xapi would then set this value to be the value of the host in the
>>>> pool with the largest number of banks, allowing a guest to access
>>>> all the banks on any host to which it migrates.
>>>>
>>>> What do you think?
>>>
>>> That sounds like the way to go.
>>
>> So should we put this on IanC's to-do-be-done list? Are you going to
>> put it on your to-do list? :-)
>
> Below/attached a draft patch (compile tested only), handling save/restore
> of the bank count, but not allowing for a config setting to specify its
> initial value (yet).
>
> Jan
>
> --- a/tools/libxc/xc_domain_save.c
> +++ b/tools/libxc/xc_domain_save.c
> @@ -1879,6 +1879,7 @@ int xc_domain_save(xc_interface *xch, in
>
>          domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
>          domctl.domain = dom;
> +        memset(&domctl.u, 0, sizeof(domctl.u));
>          domctl.u.ext_vcpucontext.vcpu = i;
>          if ( xc_domctl(xch, &domctl) < 0 )
>          {
> --- a/tools/misc/xen-hvmctx.c
> +++ b/tools/misc/xen-hvmctx.c
> @@ -383,6 +383,13 @@ static void dump_viridian_vcpu(void)
>             (unsigned long long) p.apic_assist);
>  }
>
> +static void dump_vmce_vcpu(void)
> +{
> +    HVM_SAVE_TYPE(VMCE_VCPU) p;
> +    READ(p);
> +    printf("    VMCE_VCPU: nr_banks %u\n", p.nr_banks);
> +}
> +
>  int main(int argc, char **argv)
>  {
>      int entry, domid;
> @@ -449,6 +456,7 @@ int main(int argc, char **argv)
>          case HVM_SAVE_CODE(MTRR): dump_mtrr(); break;
>          case HVM_SAVE_CODE(VIRIDIAN_DOMAIN): dump_viridian_domain(); break;
>          case HVM_SAVE_CODE(VIRIDIAN_VCPU): dump_viridian_vcpu(); break;
> +        case HVM_SAVE_CODE(VMCE_VCPU): dump_vmce_vcpu(); break;
>          case HVM_SAVE_CODE(END): break;
>          default:
>              printf(" ** Don't understand type %u: skipping\n",
> --- a/xen/arch/x86/cpu/mcheck/mce.h
> +++ b/xen/arch/x86/cpu/mcheck/mce.h
> @@ -3,6 +3,7 @@
>  #define _MCE_H
>
>  #include <xen/init.h>
> +#include <xen/sched.h>
>  #include <xen/smp.h>
>  #include <asm/types.h>
>  #include <asm/traps.h>
> @@ -54,8 +55,8 @@ int unmmap_broken_page(struct domain *d,
>  u64 mce_cap_init(void);
>  extern int firstbank;
>
> -int intel_mce_rdmsr(uint32_t msr, uint64_t *val);
> -int intel_mce_wrmsr(uint32_t msr, uint64_t val);
> +int intel_mce_rdmsr(const struct vcpu *, uint32_t msr, uint64_t *val);
> +int intel_mce_wrmsr(struct vcpu *, uint32_t msr, uint64_t val);
>
>  struct mcinfo_extended *intel_get_extended_msrs(
>      struct mcinfo_global *mig, struct mc_info *mi);
> @@ -171,18 +172,20 @@ int vmce_domain_inject(struct mcinfo_ban
>
>  extern int vmce_init(struct cpuinfo_x86 *c);
>
> -static inline int mce_vendor_bank_msr(uint32_t msr)
> +static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr)
>  {
>      if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
> -         msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks) )
> +         msr >= MSR_IA32_MC0_CTL2 &&
> +         msr < MSR_IA32_MCx_CTL2(v->arch.nr_vmce_banks) )
>          return 1;
>      return 0;
>  }
>
> -static inline int mce_bank_msr(uint32_t msr)
> +static inline int mce_bank_msr(const struct vcpu *v, uint32_t msr)
>  {
> -    if ( (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(nr_mce_banks)) ||
> -         mce_vendor_bank_msr(msr) )
> +    if ( (msr >= MSR_IA32_MC0_CTL &&
> +          msr < MSR_IA32_MCx_CTL(v->arch.nr_vmce_banks)) ||
> +         mce_vendor_bank_msr(v, msr) )
>          return 1;
>      return 0;
>  }
> --- a/xen/arch/x86/cpu/mcheck/mce_intel.c
> +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
> @@ -1421,11 +1421,12 @@ enum mcheck_type intel_mcheck_init(struc
>  }
>
>  /* intel specific MCA MSR */
> -int intel_mce_wrmsr(uint32_t msr, uint64_t val)
> +int intel_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
>  {
>      int ret = 0;
>
> -    if (msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks))
> +    if ( msr >= MSR_IA32_MC0_CTL2 &&
> +         msr < MSR_IA32_MCx_CTL2(v->arch.nr_vmce_banks) )
>      {
>          mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
>                     "Guest should not write this MSR!\n");
> @@ -1435,11 +1436,12 @@ int intel_mce_wrmsr(uint32_t msr, uint64
>      return ret;
>  }
>
> -int intel_mce_rdmsr(uint32_t msr, uint64_t *val)
> +int intel_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
>  {
>      int ret = 0;
>
> -    if (msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks))
> +    if ( msr >= MSR_IA32_MC0_CTL2 &&
> +         msr < MSR_IA32_MCx_CTL2(v->arch.nr_vmce_banks) )
>      {
>          mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
>                     "Guest should not read this MSR!\n");
> --- a/xen/arch/x86/cpu/mcheck/vmce.c
> +++ b/xen/arch/x86/cpu/mcheck/vmce.c
> @@ -10,6 +10,7 @@
>  #include <xen/delay.h>
>  #include <xen/smp.h>
>  #include <xen/mm.h>
> +#include <xen/hvm/save.h>
>  #include <asm/processor.h>
>  #include <public/sysctl.h>
>  #include <asm/system.h>
> @@ -61,21 +62,26 @@ void vmce_destroy_msr(struct domain *d)
>      dom_vmce(d) = NULL;
>  }
>
> -static int bank_mce_rdmsr(struct domain *d, uint32_t msr, uint64_t *val)
> +void vmce_init_vcpu(struct vcpu *v)
>  {
> -    int bank, ret = 1;
> -    struct domain_mca_msrs *vmce = dom_vmce(d);
> +    v->arch.nr_vmce_banks = nr_mce_banks;
> +}
> +
> +static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
> +{
> +    int ret = 1;
> +    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;
> +    struct domain_mca_msrs *vmce = dom_vmce(v->domain);
>      struct bank_entry *entry;
>
> -    bank = (msr - MSR_IA32_MC0_CTL) / 4;
> -    if ( bank >= nr_mce_banks )
> -        return -1;
> +    *val = 0;
>
>      switch ( msr & (MSR_IA32_MC0_CTL | 3) )
>      {
>      case MSR_IA32_MC0_CTL:
> -        *val = vmce->mci_ctl[bank] &
> -            (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
> +        if ( bank < nr_mce_banks )
> +            *val = vmce->mci_ctl[bank] &
> +                   (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
>          mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n",
>                     bank, *val);
>          break;
> @@ -126,7 +132,7 @@ static int bank_mce_rdmsr(struct domain
>          switch ( boot_cpu_data.x86_vendor )
>          {
>          case X86_VENDOR_INTEL:
> -            ret = intel_mce_rdmsr(msr, val);
> +            ret = intel_mce_rdmsr(v, msr, val);
>              break;
>          default:
>              ret = 0;
> @@ -145,13 +151,13 @@ static int bank_mce_rdmsr(struct domain
>   */
>  int vmce_rdmsr(uint32_t msr, uint64_t *val)
>  {
> -    struct domain *d = current->domain;
> -    struct domain_mca_msrs *vmce = dom_vmce(d);
> +    const struct vcpu *cur = current;
> +    struct domain_mca_msrs *vmce = dom_vmce(cur->domain);
>      int ret = 1;
>
>      *val = 0;
>
> -    spin_lock(&dom_vmce(d)->lock);
> +    spin_lock(&vmce->lock);
>
>      switch ( msr )
>      {
> @@ -173,28 +179,26 @@ int vmce_rdmsr(uint32_t msr, uint64_t *v
>                     *val);
>          break;
>      default:
> -        ret = mce_bank_msr(msr) ? bank_mce_rdmsr(d, msr, val) : 0;
> +        ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0;
>          break;
>      }
>
> -    spin_unlock(&dom_vmce(d)->lock);
> +    spin_unlock(&vmce->lock);
>      return ret;
>  }
>
> -static int bank_mce_wrmsr(struct domain *d, u32 msr, u64 val)
> +static int bank_mce_wrmsr(struct vcpu *v, u32 msr, u64 val)
>  {
> -    int bank, ret = 1;
> -    struct domain_mca_msrs *vmce = dom_vmce(d);
> +    int ret = 1;
> +    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;
> +    struct domain_mca_msrs *vmce = dom_vmce(v->domain);
>      struct bank_entry *entry = NULL;
>
> -    bank = (msr - MSR_IA32_MC0_CTL) / 4;
> -    if ( bank >= nr_mce_banks )
> -        return -EINVAL;
> -
>      switch ( msr & (MSR_IA32_MC0_CTL | 3) )
>      {
>      case MSR_IA32_MC0_CTL:
> -        vmce->mci_ctl[bank] = val;
> +        if ( bank < nr_mce_banks )
> +            vmce->mci_ctl[bank] = val;
>          break;
>      case MSR_IA32_MC0_STATUS:
>          /* Give the first entry of the list, it corresponds to current
> @@ -202,9 +206,9 @@ static int bank_mce_wrmsr(struct domain
>           * the guest, this node will be deleted.
>           * Only error bank is written. Non-error banks simply return.
>           */
> -        if ( !list_empty(&dom_vmce(d)->impact_header) )
> +        if ( !list_empty(&vmce->impact_header) )
>          {
> -            entry = list_entry(dom_vmce(d)->impact_header.next,
> +            entry = list_entry(vmce->impact_header.next,
>                                 struct bank_entry, list);
>              if ( entry->bank == bank )
>                  entry->mci_status = val;
> @@ -228,7 +232,7 @@ static int bank_mce_wrmsr(struct domain
>          switch ( boot_cpu_data.x86_vendor )
>          {
>          case X86_VENDOR_INTEL:
> -            ret = intel_mce_wrmsr(msr, val);
> +            ret = intel_mce_wrmsr(v, msr, val);
>              break;
>          default:
>              ret = 0;
> @@ -247,9 +251,9 @@ static int bank_mce_wrmsr(struct domain
>   */
>  int vmce_wrmsr(u32 msr, u64 val)
>  {
> -    struct domain *d = current->domain;
> +    struct vcpu *cur = current;
>      struct bank_entry *entry = NULL;
> -    struct domain_mca_msrs *vmce = dom_vmce(d);
> +    struct domain_mca_msrs *vmce = dom_vmce(cur->domain);
>      int ret = 1;
>
>      if ( !g_mcg_cap )
> @@ -266,7 +270,7 @@ int vmce_wrmsr(u32 msr, u64 val)
>          vmce->mcg_status = val;
>          mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val);
>          /* For HVM guest, this is the point for deleting vMCE injection node */
> -        if ( d->is_hvm && (vmce->nr_injection > 0) )
> +        if ( is_hvm_vcpu(cur) && (vmce->nr_injection > 0) )
>          {
>              vmce->nr_injection--; /* Should be 0 */
>              if ( !list_empty(&vmce->impact_header) )
> @@ -293,7 +297,7 @@ int vmce_wrmsr(u32 msr, u64 val)
>          ret = -1;
>          break;
>      default:
> -        ret = mce_bank_msr(msr) ? bank_mce_wrmsr(d, msr, val) : 0;
> +        ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0;
>          break;
>      }
>
> @@ -301,6 +305,47 @@ int vmce_wrmsr(u32 msr, u64 val)
>      return ret;
>  }
>
> +static int vmce_save_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
> +{
> +    struct vcpu *v;
> +    int err = 0;
> +
> +    for_each_vcpu( d, v ) {
> +        struct hvm_vmce_vcpu ctxt = {
> +            .nr_banks = v->arch.nr_vmce_banks
> +        };
> +
> +        err = hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt);
> +        if ( err )
> +            break;
> +    }
> +
> +    return err;
> +}
> +
> +static int vmce_load_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
> +{
> +    unsigned int vcpuid = hvm_load_instance(h);
> +    struct vcpu *v;
> +    struct hvm_vmce_vcpu ctxt;
> +    int err;
> +
> +    if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
> +    {
> +        gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", vcpuid);
> +        return -EINVAL;
> +    }
> +
> +    err = hvm_load_entry(VMCE_VCPU, h, &ctxt);
> +    if ( !err )
> +        v->arch.nr_vmce_banks = ctxt.nr_banks;
> +
> +    return err;
> +}
> +
> +HVM_REGISTER_SAVE_RESTORE(VMCE_VCPU, vmce_save_vcpu_ctxt,
> +                          vmce_load_vcpu_ctxt, 1, HVMSR_PER_VCPU);
> +
>  int inject_vmce(struct domain *d)
>  {
>      int cpu = smp_processor_id();
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -468,6 +468,8 @@ int vcpu_initialise(struct vcpu *v)
>
>      v->arch.pv_vcpu.ctrlreg[4] = real_cr4_to_pv_guest_cr4(mmu_cr4_features);
>
> +    vmce_init_vcpu(v);
> +
>      rc = is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0;
>   done:
>      if ( rc )
> --- a/xen/arch/x86/domctl.c
> +++ b/xen/arch/x86/domctl.c
> @@ -1027,11 +1027,12 @@ long arch_do_domctl(
>              evc->syscall32_callback_eip    = 0;
>              evc->syscall32_disables_events = 0;
>          }
> +        evc->nr_mce_banks = v->arch.nr_vmce_banks;
>      }
>      else
>      {
>          ret = -EINVAL;
> -        if ( evc->size != sizeof(*evc) )
> +        if ( evc->size < offsetof(typeof(*evc), nr_mce_banks) )
>              goto ext_vcpucontext_out;
>  #ifdef __x86_64__
>          if ( !is_hvm_domain(d) )
> @@ -1059,6 +1060,16 @@ long arch_do_domctl(
>                   (evc->syscall32_callback_cs & ~3) ||
>                   evc->syscall32_callback_eip )
>                  goto ext_vcpucontext_out;
> +
> +            if ( evc->size >= offsetof(typeof(*evc), mbz[ARRAY_SIZE(evc->mbz)]) )
> +            {
> +                unsigned int i;
> +
> +                for ( i = 0; i < ARRAY_SIZE(evc->mbz); ++i )
> +                    if ( evc->mbz[i] )
> +                        goto ext_vcpucontext_out;
> +                v->arch.nr_vmce_banks = evc->nr_mce_banks;
> +            }
>          }
>
>          ret = 0;
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -488,6 +488,8 @@ struct arch_vcpu
>      /* This variable determines whether nonlazy extended state has been used,
>       * and thus should be saved/restored. */
>      bool_t nonlazy_xstate_used;
> +
> +    uint8_t nr_vmce_banks;
>
>      struct paging_vcpu paging;
>
> --- a/xen/include/asm-x86/mce.h
> +++ b/xen/include/asm-x86/mce.h
> @@ -28,6 +28,7 @@ struct domain_mca_msrs
>  /* Guest vMCE MSRs virtualization */
>  extern int vmce_init_msr(struct domain *d);
>  extern void vmce_destroy_msr(struct domain *d);
> +extern void vmce_init_vcpu(struct vcpu *);
>  extern int vmce_wrmsr(uint32_t msr, uint64_t val);
>  extern int vmce_rdmsr(uint32_t msr, uint64_t *val);
>
> --- a/xen/include/public/arch-x86/hvm/save.h
> +++ b/xen/include/public/arch-x86/hvm/save.h
> @@ -585,9 +585,15 @@ struct hvm_viridian_vcpu_context {
>
>  DECLARE_HVM_SAVE_TYPE(VIRIDIAN_VCPU, 17, struct hvm_viridian_vcpu_context);
>
> +struct hvm_vmce_vcpu {
> +    uint8_t nr_banks;
> +};
> +
> +DECLARE_HVM_SAVE_TYPE(VMCE_VCPU, 18, struct hvm_vmce_vcpu);
> +
>  /*
>   * Largest type-code in use
>   */
> -#define HVM_SAVE_CODE_MAX 17
> +#define HVM_SAVE_CODE_MAX 18
>
>  #endif /* __XEN_PUBLIC_HVM_SAVE_X86_H__ */
> --- a/xen/include/public/domctl.h
> +++ b/xen/include/public/domctl.h
> @@ -559,7 +559,7 @@ struct xen_domctl_ext_vcpucontext {
>      uint32_t         vcpu;
>      /*
>       * SET: Size of struct (IN)
> -     * GET: Size of struct (OUT)
> +     * GET: Size of struct (OUT, up to 128 bytes)
>       */
>      uint32_t         size;
>  #if defined(__i386__) || defined(__x86_64__)
> @@ -571,6 +571,10 @@ struct xen_domctl_ext_vcpucontext {
>      uint16_t         sysenter_callback_cs;
>      uint8_t          syscall32_disables_events;
>      uint8_t          sysenter_disables_events;
> +    uint8_t          nr_mce_banks;
> +    /* This array can be split and used for future extension. */
> +    /* It is there just to grow the structure beyond 4.1's size. */
> +    uint8_t          mbz[9];
>  #endif
>  };
>  typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t;

Jan,

How about using a static bank size, say 256, which is the maximum bank number the architecture (the IA32_MCG_CAP MSR) defines?

Thanks,
Jinsong

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
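For context on the ceiling referred to in the question above: a CPU reports its number of error-reporting banks in the low eight bits (the Count field) of the IA32_MCG_CAP MSR, so no more than 255 banks can ever be enumerated, and the patch's uint8_t nr_banks is wide enough for any value a host can present. Below is a minimal sketch of that arithmetic and of the range check a guest-visible bank count enables; guest_bank_msr() and guest_nr_banks are illustrative names only, not identifiers from the patch:

    #include <stdint.h>

    #define MSR_IA32_MCG_CAP     0x00000179
    #define MSR_IA32_MC0_CTL     0x00000400
    #define MCG_CAP_COUNT_MASK   0xffull   /* MCG_CAP bits 7:0 = bank count */

    /* Each bank owns four consecutive MSRs: MCi_CTL, _STATUS, _ADDR, _MISC. */
    #define MSR_IA32_MCx_CTL(x)  (MSR_IA32_MC0_CTL + 4 * (x))

    /* Architectural ceiling: the 8-bit Count field can never exceed 255. */
    static unsigned int banks_from_mcg_cap(uint64_t mcg_cap)
    {
        return (unsigned int)(mcg_cap & MCG_CAP_COUNT_MASK);
    }

    /*
     * Is msr one of the banked MCA MSRs the guest may touch, given the
     * bank count it was told about (the value the patch saves/restores)?
     */
    static int guest_bank_msr(uint8_t guest_nr_banks, uint32_t msr)
    {
        return msr >= MSR_IA32_MC0_CTL &&
               msr < MSR_IA32_MCx_CTL(guest_nr_banks);
    }

Whether the virtual bank count is pinned to a fixed architectural maximum, as suggested, or carried per vCPU across migration, as the patch does, the same bound applies; the check itself depends only on the guest-visible count, not on whatever the current host happens to have.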
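On the domctl side, the patch switches from an exact-size check to size-based struct versioning: a 4.1-era toolstack passes a smaller evc->size, so the hypervisor simply leaves the new fields alone for it, while a newer toolstack whose size also covers the mbz[] padding must zero those bytes before nr_mce_banks is honoured. Here is a rough sketch of that pattern under assumed, simplified names (struct ctxt, oldfield, newfield; not the real xen_domctl_ext_vcpucontext layout):

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical structure: newfield and mbz[] were appended in a later version. */
    struct ctxt {
        uint32_t size;      /* caller fills in sizeof() of *its* view of the struct */
        uint32_t oldfield;
        uint8_t  newfield;  /* appended after the original release */
        uint8_t  mbz[3];    /* must-be-zero bytes reserved for future growth */
    };

    /* Accept both old (short) and new (full-size) callers. */
    static int ctxt_set(const struct ctxt *in, struct ctxt *cur)
    {
        size_t i;

        /* Reject callers whose structure does not even cover the original layout. */
        if (in->size < offsetof(struct ctxt, newfield))
            return -1;

        cur->oldfield = in->oldfield;

        /* Only a caller whose structure reaches past mbz[] may set the new
         * field, and then the reserved bytes must really be zero. */
        if (in->size >= offsetof(struct ctxt, mbz) + sizeof(in->mbz)) {
            for (i = 0; i < sizeof(in->mbz); i++)
                if (in->mbz[i])
                    return -1;
            cur->newfield = in->newfield;
        }
        return 0;
    }

Growing the public structure with explicitly must-be-zero bytes is what makes this work: it pushes the new size past the 4.1 layout, so the set path can tell the two generations of callers apart from the size alone.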