[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [Xen-devel] vMCE vs migration
On Tue, 2012-01-31 at 11:27 +0000, George Dunlap wrote: > On Mon, 2012-01-30 at 13:47 +0000, Jan Beulich wrote: > > >>> On 26.01.12 at 17:54, George Dunlap <george.dunlap@xxxxxxxxxx> wrote: > > > On Tue, 2012-01-24 at 11:08 +0000, Jan Beulich wrote: > > >> >>> On 24.01.12 at 11:29, George Dunlap <George.Dunlap@xxxxxxxxxxxxx> > > >> >>> wrote: > > >> > On Mon, Jan 23, 2012 at 11:08 AM, Jan Beulich <JBeulich@xxxxxxxx> > > >> > wrote: > > >> >> x86's vMCE implementation lets a guest know of as many MCE reporting > > >> >> banks as there are in the host. While a PV guest could be expected to > > >> >> deal with this number changing (particularly decreasing) during > > >> >> migration > > >> >> (not currently handled anywhere afaict), for HVM guests this is > > >> >> certainly > > >> >> wrong. > > >> >> > > >> >> At least to me it isn't, however, clear how to properly handle this. > > >> >> The > > >> >> easiest would appear to be to save and restore the number of banks > > >> >> the guest was made believe it can access, making vmce_{rd,wr}msr() > > >> >> silently tolerate accesses between the host and guest values. > > >> > > > >> > We ran into this in the XS 6.0 release as well. I think that the > > >> > ideal thing to do would be to have a parameter that can be set at > > >> > boot, to say how many vMCE banks a guest has, defaulting to the number > > >> > of MCE banks on the host. This parameter would be preserved across > > >> > migration. Ideally, a pool-aware toolstack like xapi would then set > > >> > this value to be the value of the host in the pool with the largest > > >> > number of banks, allowing a guest to access all the banks on any host > > >> > to which it migrates. > > >> > > > >> > What do you think? > > >> > > >> That sounds like the way to go. > > > > > > So should we put this on IanC's to-do-be-done list? Are you going to > > > put it on your to-do list? 
:-) > > > > Below/attached a draft patch (compile tested only), handling save/ > > restore of the bank count, but not allowing for a config setting to > > specify its initial value (yet). > > Looks pretty good for a first blush. Just one question: Why is the vmce > count made on a per-vcpu basis, rather than on a per-domain basis, like > the actual banks are? Is the host MCE stuff per-vcpu? (Per-pcpu, that is...) > > -George > > > +static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t > > *val) > > +{ > > + int ret = 1; > > + unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4; > > + struct domain_mca_msrs *vmce = dom_vmce(v->domain); > > struct bank_entry *entry; > > > > - bank = (msr - MSR_IA32_MC0_CTL) / 4; > > - if ( bank >= nr_mce_banks ) > > - return -1; > > + *val = 0; > > > > switch ( msr & (MSR_IA32_MC0_CTL | 3) ) > > { > > case MSR_IA32_MC0_CTL: > > - *val = vmce->mci_ctl[bank] & > > - (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL); > > + if ( bank < nr_mce_banks ) > > + *val = vmce->mci_ctl[bank] & > > + (h_mci_ctrl ? 
h_mci_ctrl[bank] : ~0UL); > > mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n", > > bank, *val); > > break; > > @@ -126,7 +132,7 @@ static int bank_mce_rdmsr(struct domain > > switch ( boot_cpu_data.x86_vendor ) > > { > > case X86_VENDOR_INTEL: > > - ret = intel_mce_rdmsr(msr, val); > > + ret = intel_mce_rdmsr(v, msr, val); > > break; > > default: > > ret = 0; > > @@ -145,13 +151,13 @@ static int bank_mce_rdmsr(struct domain > > */ > > int vmce_rdmsr(uint32_t msr, uint64_t *val) > > { > > - struct domain *d = current->domain; > > - struct domain_mca_msrs *vmce = dom_vmce(d); > > + const struct vcpu *cur = current; > > + struct domain_mca_msrs *vmce = dom_vmce(cur->domain); > > int ret = 1; > > > > *val = 0; > > > > - spin_lock(&dom_vmce(d)->lock); > > + spin_lock(&vmce->lock); > > > > switch ( msr ) > > { > > @@ -173,28 +179,26 @@ int vmce_rdmsr(uint32_t msr, uint64_t *v > > *val); > > break; > > default: > > - ret = mce_bank_msr(msr) ? bank_mce_rdmsr(d, msr, val) : 0; > > + ret = mce_bank_msr(cur, msr) ? 
bank_mce_rdmsr(cur, msr, val) : 0; > > break; > > } > > > > - spin_unlock(&dom_vmce(d)->lock); > > + spin_unlock(&vmce->lock); > > return ret; > > } > > > > -static int bank_mce_wrmsr(struct domain *d, u32 msr, u64 val) > > +static int bank_mce_wrmsr(struct vcpu *v, u32 msr, u64 val) > > { > > - int bank, ret = 1; > > - struct domain_mca_msrs *vmce = dom_vmce(d); > > + int ret = 1; > > + unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4; > > + struct domain_mca_msrs *vmce = dom_vmce(v->domain); > > struct bank_entry *entry = NULL; > > > > - bank = (msr - MSR_IA32_MC0_CTL) / 4; > > - if ( bank >= nr_mce_banks ) > > - return -EINVAL; > > - > > switch ( msr & (MSR_IA32_MC0_CTL | 3) ) > > { > > case MSR_IA32_MC0_CTL: > > - vmce->mci_ctl[bank] = val; > > + if ( bank < nr_mce_banks ) > > + vmce->mci_ctl[bank] = val; > > break; > > case MSR_IA32_MC0_STATUS: > > /* Give the first entry of the list, it corresponds to current > > @@ -202,9 +206,9 @@ static int bank_mce_wrmsr(struct domain > > * the guest, this node will be deleted. > > * Only error bank is written. Non-error banks simply return. 
> > */ > > - if ( !list_empty(&dom_vmce(d)->impact_header) ) > > + if ( !list_empty(&vmce->impact_header) ) > > { > > - entry = list_entry(dom_vmce(d)->impact_header.next, > > + entry = list_entry(vmce->impact_header.next, > > struct bank_entry, list); > > if ( entry->bank == bank ) > > entry->mci_status = val; > > @@ -228,7 +232,7 @@ static int bank_mce_wrmsr(struct domain > > switch ( boot_cpu_data.x86_vendor ) > > { > > case X86_VENDOR_INTEL: > > - ret = intel_mce_wrmsr(msr, val); > > + ret = intel_mce_wrmsr(v, msr, val); > > break; > > default: > > ret = 0; > > @@ -247,9 +251,9 @@ static int bank_mce_wrmsr(struct domain > > */ > > int vmce_wrmsr(u32 msr, u64 val) > > { > > - struct domain *d = current->domain; > > + struct vcpu *cur = current; > > struct bank_entry *entry = NULL; > > - struct domain_mca_msrs *vmce = dom_vmce(d); > > + struct domain_mca_msrs *vmce = dom_vmce(cur->domain); > > int ret = 1; > > > > if ( !g_mcg_cap ) > > @@ -266,7 +270,7 @@ int vmce_wrmsr(u32 msr, u64 val) > > vmce->mcg_status = val; > > mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val); > > /* For HVM guest, this is the point for deleting vMCE injection > > node */ > > - if ( d->is_hvm && (vmce->nr_injection > 0) ) > > + if ( is_hvm_vcpu(cur) && (vmce->nr_injection > 0) ) > > { > > vmce->nr_injection--; /* Should be 0 */ > > if ( !list_empty(&vmce->impact_header) ) > > @@ -293,7 +297,7 @@ int vmce_wrmsr(u32 msr, u64 val) > > ret = -1; > > break; > > default: > > - ret = mce_bank_msr(msr) ? bank_mce_wrmsr(d, msr, val) : 0; > > + ret = mce_bank_msr(cur, msr) ? 
bank_mce_wrmsr(cur, msr, val) : 0; > > break; > > } > > > > @@ -301,6 +305,47 @@ int vmce_wrmsr(u32 msr, u64 val) > > return ret; > > } > > > > +static int vmce_save_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h) > > +{ > > + struct vcpu *v; > > + int err = 0; > > + > > + for_each_vcpu( d, v ) { > > + struct hvm_vmce_vcpu ctxt = { > > + .nr_banks = v->arch.nr_vmce_banks > > + }; > > + > > + err = hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt); > > + if ( err ) > > + break; > > + } > > + > > + return err; > > +} > > + > > +static int vmce_load_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h) > > +{ > > + unsigned int vcpuid = hvm_load_instance(h); > > + struct vcpu *v; > > + struct hvm_vmce_vcpu ctxt; > > + int err; > > + > > + if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) > > + { > > + gdprintk(XENLOG_ERR, "HVM restore: domain has no vcpu %u\n", > > vcpuid); > > + return -EINVAL; > > + } > > + > > + err = hvm_load_entry(VMCE_VCPU, h, &ctxt); > > + if ( !err ) > > + v->arch.nr_vmce_banks = ctxt.nr_banks; > > + > > + return err; > > +} > > + > > +HVM_REGISTER_SAVE_RESTORE(VMCE_VCPU, vmce_save_vcpu_ctxt, > > + vmce_load_vcpu_ctxt, 1, HVMSR_PER_VCPU); > > + > > int inject_vmce(struct domain *d) > > { > > int cpu = smp_processor_id(); > > --- a/xen/arch/x86/domain.c > > +++ b/xen/arch/x86/domain.c > > @@ -468,6 +468,8 @@ int vcpu_initialise(struct vcpu *v) > > > > v->arch.pv_vcpu.ctrlreg[4] = > > real_cr4_to_pv_guest_cr4(mmu_cr4_features); > > > > + vmce_init_vcpu(v); > > + > > rc = is_pv_32on64_vcpu(v) ? 
setup_compat_l4(v) : 0; > > done: > > if ( rc ) > > --- a/xen/arch/x86/domctl.c > > +++ b/xen/arch/x86/domctl.c > > @@ -1027,11 +1027,12 @@ long arch_do_domctl( > > evc->syscall32_callback_eip = 0; > > evc->syscall32_disables_events = 0; > > } > > + evc->nr_mce_banks = v->arch.nr_vmce_banks; > > } > > else > > { > > ret = -EINVAL; > > - if ( evc->size != sizeof(*evc) ) > > + if ( evc->size < offsetof(typeof(*evc), nr_mce_banks) ) > > goto ext_vcpucontext_out; > > #ifdef __x86_64__ > > if ( !is_hvm_domain(d) ) > > @@ -1059,6 +1060,16 @@ long arch_do_domctl( > > (evc->syscall32_callback_cs & ~3) || > > evc->syscall32_callback_eip ) > > goto ext_vcpucontext_out; > > + > > + if ( evc->size >= offsetof(typeof(*evc), > > mbz[ARRAY_SIZE(evc->mbz)]) ) > > + { > > + unsigned int i; > > + > > + for ( i = 0; i < ARRAY_SIZE(evc->mbz); ++i ) > > + if ( evc->mbz[i] ) > > + goto ext_vcpucontext_out; > > + v->arch.nr_vmce_banks = evc->nr_mce_banks; > > + } > > } > > > > ret = 0; > > --- a/xen/include/asm-x86/domain.h > > +++ b/xen/include/asm-x86/domain.h > > @@ -488,6 +488,8 @@ struct arch_vcpu > > /* This variable determines whether nonlazy extended state has been > > used, > > * and thus should be saved/restored. 
*/ > > bool_t nonlazy_xstate_used; > > + > > + uint8_t nr_vmce_banks; > > > > struct paging_vcpu paging; > > > > --- a/xen/include/asm-x86/mce.h > > +++ b/xen/include/asm-x86/mce.h > > @@ -28,6 +28,7 @@ struct domain_mca_msrs > > /* Guest vMCE MSRs virtualization */ > > extern int vmce_init_msr(struct domain *d); > > extern void vmce_destroy_msr(struct domain *d); > > +extern void vmce_init_vcpu(struct vcpu *); > > extern int vmce_wrmsr(uint32_t msr, uint64_t val); > > extern int vmce_rdmsr(uint32_t msr, uint64_t *val); > > > > --- a/xen/include/public/arch-x86/hvm/save.h > > +++ b/xen/include/public/arch-x86/hvm/save.h > > @@ -585,9 +585,15 @@ struct hvm_viridian_vcpu_context { > > > > DECLARE_HVM_SAVE_TYPE(VIRIDIAN_VCPU, 17, struct hvm_viridian_vcpu_context); > > > > +struct hvm_vmce_vcpu { > > + uint8_t nr_banks; > > +}; > > + > > +DECLARE_HVM_SAVE_TYPE(VMCE_VCPU, 18, struct hvm_vmce_vcpu); > > + > > /* > > * Largest type-code in use > > */ > > -#define HVM_SAVE_CODE_MAX 17 > > +#define HVM_SAVE_CODE_MAX 18 > > > > #endif /* __XEN_PUBLIC_HVM_SAVE_X86_H__ */ > > --- a/xen/include/public/domctl.h > > +++ b/xen/include/public/domctl.h > > @@ -559,7 +559,7 @@ struct xen_domctl_ext_vcpucontext { > > uint32_t vcpu; > > /* > > * SET: Size of struct (IN) > > - * GET: Size of struct (OUT) > > + * GET: Size of struct (OUT, up to 128 bytes) > > */ > > uint32_t size; > > #if defined(__i386__) || defined(__x86_64__) > > @@ -571,6 +571,10 @@ struct xen_domctl_ext_vcpucontext { > > uint16_t sysenter_callback_cs; > > uint8_t syscall32_disables_events; > > uint8_t sysenter_disables_events; > > + uint8_t nr_mce_banks; > > + /* This array can be split and used for future extension. */ > > + /* It is there just to grow the structure beyond 4.1's size. 
*/ > > + uint8_t mbz[9]; > > #endif > > }; > > typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t; > > > > _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
Lists.xenproject.org is hosted with RackSpace, monitoring our servers 24x7x365 and backed by RackSpace's Fanatical Support.