Re: [Xen-devel] [PATCH 2/2] x86/vMCE: save/restore MCA capabilities



Jan Beulich wrote:
> This allows migration to a host with fewer MCA banks than the source
> host had; without this patch, accesses to the excess banks' MSRs
> caused #GP-s in the guest after migration (and it depended on the
> guest kernel whether this would be fatal).
> 
> A fundamental question is whether we should also save/restore MCG_CTL
> and MCi_CTL, as the HVM save record had better be defined, from the
> beginning, to cover the complete state that needs saving (I'm unsure
> whether the save/restore logic allows for future extension of an
> existing record).

I'm not sure about this point. I have always found the meaning of
MCG_CTL/MCi_CTL confusing, and their definition in the SDM looks
ambiguous to me. Tony, could you help clarify the real hardware meaning
of MCG_CTL/MCi_CTL? The MCE logic seems to rely on them only rarely,
especially on the individual bits of MCi_CTL.

Another question: why does the patch define mcg_cap per vCPU, while the
others (mcg_ctl/mcg_status/mci_ctl) stay per domain? Semantically that
looks somewhat inconsistent.
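
For what it's worth, the per-vCPU choice seems tied to the restore
path: each vCPU's incoming caps value is validated against the host's
g_mcg_cap before being accepted. Below is a minimal standalone sketch
of that acceptance test as I read it from the patch (the constants and
sample values are my own assumptions for illustration, not taken
verbatim from the Xen headers):

    #include <stdint.h>
    #include <stdio.h>

    /* MCG_CAP layout per the SDM: bits 7:0 hold the bank count and
     * bit 8 is MCG_CTL_P (definitions assumed for this sketch). */
    #define MCG_CAP_COUNT 0xffULL
    #define MCG_CTL_P     (1ULL << 8)

    /* Hypothetical stand-in for the host's g_mcg_cap:
     * 6 banks, MCG_CTL present. */
    static const uint64_t g_mcg_cap = MCG_CTL_P | 6;

    /* Mirrors the patch's vmce_restore_vcpu() check: any capability
     * bit set in the incoming caps but clear on the host - ignoring
     * the bank count and MCG_CTL_P - makes the restore fail. */
    static int caps_acceptable(uint64_t caps)
    {
        return !(caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P);
    }

    int main(void)
    {
        /* A source host with 8 banks passes: the count itself is
         * deliberately not compared. */
        printf("%d\n", caps_acceptable(MCG_CTL_P | 8));    /* prints 1 */
        /* An extra feature bit (e.g. bit 10, MCG_CMCI_P) is rejected. */
        printf("%d\n", caps_acceptable((1ULL << 10) | 6)); /* prints 0 */
        return 0;
    }

If that reading is right, mcg_cap has to live with the vCPU so that
each incoming VMCE_VCPU record can be validated (and refused)
individually on restore, while the other three values remain in the
per-domain domain_mca_msrs structure guarded by its lock.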

Thanks,
Jinsong

> 
> Of course, this change is expected to make migration from new to older
> Xen impossible (again I'm unsure what the save/restore logic does with
> records it doesn't even know about).
> 
> The (trivial) tools side change may seem unrelated, but the code
> should have been that way from the beginning, to allow the hypervisor
> to look at currently unused ext_vcpucontext fields without risking
> reading garbage when those fields get a meaning assigned in the
> future. This isn't being enforced here - should it be? (Obviously, for
> backwards compatibility, the hypervisor must assume these fields to be
> clear only when the extended context's size exceeds the old original
> one.)
> 
> A future addition to this change might be to allow configuring the
> number of banks and other MCA capabilities for a guest before it
> starts (i.e. not inheriting the values seen on the first host it
> runs on).
> 
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> 
> --- a/tools/libxc/xc_domain_save.c
> +++ b/tools/libxc/xc_domain_save.c
> @@ -1879,6 +1879,7 @@ int xc_domain_save(xc_interface *xch, in
> 
>          domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
>          domctl.domain = dom;
> +        memset(&domctl.u, 0, sizeof(domctl.u));
>          domctl.u.ext_vcpucontext.vcpu = i;
>          if ( xc_domctl(xch, &domctl) < 0 )
>          {
> --- a/tools/misc/xen-hvmctx.c
> +++ b/tools/misc/xen-hvmctx.c
> @@ -383,6 +383,13 @@ static void dump_viridian_vcpu(void)
>             (unsigned long long) p.apic_assist);
>  }
> 
> +static void dump_vmce_vcpu(void)
> +{
> +    HVM_SAVE_TYPE(VMCE_VCPU) p;
> +    READ(p);
> +    printf("    VMCE_VCPU: caps %" PRIx64 "\n", p.caps);
> +}
> +
>  int main(int argc, char **argv)
>  {
>      int entry, domid;
> @@ -449,6 +456,7 @@ int main(int argc, char **argv)
>          case HVM_SAVE_CODE(MTRR): dump_mtrr(); break;
>          case HVM_SAVE_CODE(VIRIDIAN_DOMAIN): dump_viridian_domain(); break;
>          case HVM_SAVE_CODE(VIRIDIAN_VCPU): dump_viridian_vcpu(); break;
> +        case HVM_SAVE_CODE(VMCE_VCPU): dump_vmce_vcpu(); break;
>          case HVM_SAVE_CODE(END): break;
>          default:
>              printf(" ** Don't understand type %u: skipping\n",
> --- a/xen/arch/x86/cpu/mcheck/mce.h
> +++ b/xen/arch/x86/cpu/mcheck/mce.h
> @@ -3,6 +3,7 @@
>  #define _MCE_H
> 
>  #include <xen/init.h>
> +#include <xen/sched.h>
>  #include <xen/smp.h>
>  #include <asm/types.h>
>  #include <asm/traps.h>
> @@ -54,8 +55,8 @@ int unmmap_broken_page(struct domain *d,
>  u64 mce_cap_init(void);
>  extern unsigned int firstbank;
> 
> -int intel_mce_rdmsr(uint32_t msr, uint64_t *val);
> -int intel_mce_wrmsr(uint32_t msr, uint64_t val);
> +int intel_mce_rdmsr(const struct vcpu *, uint32_t msr, uint64_t *val);
> +int intel_mce_wrmsr(struct vcpu *, uint32_t msr, uint64_t val);
> 
>  struct mcinfo_extended *intel_get_extended_msrs(
>      struct mcinfo_global *mig, struct mc_info *mi);
> @@ -171,18 +172,20 @@ int vmce_domain_inject(struct mcinfo_ban
> 
>  extern int vmce_init(struct cpuinfo_x86 *c);
> 
> -static inline int mce_vendor_bank_msr(uint32_t msr)
> +static inline int mce_vendor_bank_msr(const struct vcpu *v, uint32_t msr)
>  {
>      if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
> -         msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks) )
> +         msr >= MSR_IA32_MC0_CTL2 &&
> +         msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) )
>            return 1;
>      return 0;
>  }
> 
> -static inline int mce_bank_msr(uint32_t msr)
> +static inline int mce_bank_msr(const struct vcpu *v, uint32_t msr)
>  {
> -    if ( (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MCx_CTL(nr_mce_banks)) ||
> -        mce_vendor_bank_msr(msr) )
> +    if ( (msr >= MSR_IA32_MC0_CTL &&
> +          msr < MSR_IA32_MCx_CTL(v->arch.mcg_cap & MCG_CAP_COUNT)) ||
> +         mce_vendor_bank_msr(v, msr) )
>          return 1;
>      return 0;
>  }
> --- a/xen/arch/x86/cpu/mcheck/mce_intel.c
> +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c
> @@ -1421,11 +1421,12 @@ enum mcheck_type intel_mcheck_init(struc
>  }
> 
>  /* intel specific MCA MSR */
> -int intel_mce_wrmsr(uint32_t msr, uint64_t val)
> +int intel_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
>  {
>      int ret = 0;
> 
> -    if (msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks))
> +    if ( msr >= MSR_IA32_MC0_CTL2 &&
> +         msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) )
>      {
>          mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
>                   "Guest should not write this MSR!\n");
> @@ -1435,11 +1436,12 @@ int intel_mce_wrmsr(uint32_t msr, uint64
>      return ret;
>  }
> 
> -int intel_mce_rdmsr(uint32_t msr, uint64_t *val)
> +int intel_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
>  {
>      int ret = 0;
> 
> -    if (msr >= MSR_IA32_MC0_CTL2 && msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks))
> +    if ( msr >= MSR_IA32_MC0_CTL2 &&
> +         msr < MSR_IA32_MCx_CTL2(v->arch.mcg_cap & MCG_CAP_COUNT) )
>      {
>          mce_printk(MCE_QUIET, "We have disabled CMCI capability, "
>                   "Guest should not read this MSR!\n");
> --- a/xen/arch/x86/cpu/mcheck/vmce.c
> +++ b/xen/arch/x86/cpu/mcheck/vmce.c
> @@ -10,6 +10,7 @@
>  #include <xen/delay.h>
>  #include <xen/smp.h>
>  #include <xen/mm.h>
> +#include <xen/hvm/save.h>
>  #include <asm/processor.h>
>  #include <public/sysctl.h>
>  #include <asm/system.h>
> @@ -42,7 +43,6 @@ int vmce_init_msr(struct domain *d)
>             nr_mce_banks * sizeof(*dom_vmce(d)->mci_ctl));
> 
>      dom_vmce(d)->mcg_status = 0x0;
> -    dom_vmce(d)->mcg_cap = g_mcg_cap;
>      dom_vmce(d)->mcg_ctl = ~(uint64_t)0x0;
>      dom_vmce(d)->nr_injection = 0;
> 
> @@ -61,21 +61,41 @@ void vmce_destroy_msr(struct domain *d)
>      dom_vmce(d) = NULL;
>  }
> 
> -static int bank_mce_rdmsr(struct domain *d, uint32_t msr, uint64_t *val)
> +void vmce_init_vcpu(struct vcpu *v)
>  {
> -    int bank, ret = 1;
> -    struct domain_mca_msrs *vmce = dom_vmce(d);
> +    v->arch.mcg_cap = g_mcg_cap;
> +}
> +
> +int vmce_restore_vcpu(struct vcpu *v, uint64_t caps)
> +{
> +    if ( caps & ~g_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P )
> +    {
> +        dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities"
> +                " %#" PRIx64 " for d%d:v%u (supported: %#Lx)\n",
> +                is_hvm_vcpu(v) ? "HVM" : "PV", caps, v->domain->domain_id,
> +                v->vcpu_id, g_mcg_cap & ~MCG_CAP_COUNT);
> +        return -EPERM;
> +    }
> +
> +    v->arch.mcg_cap = caps;
> +    return 0;
> +}
> +
> +static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
> +{
> +    int ret = 1;
> +    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;
> +    struct domain_mca_msrs *vmce = dom_vmce(v->domain);
>      struct bank_entry *entry;
> 
> -    bank = (msr - MSR_IA32_MC0_CTL) / 4;
> -    if ( bank >= nr_mce_banks )
> -        return -1;
> +    *val = 0;
> 
>      switch ( msr & (MSR_IA32_MC0_CTL | 3) )
>      {
>      case MSR_IA32_MC0_CTL:
> -        *val = vmce->mci_ctl[bank] &
> -            (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
> +        if ( bank < nr_mce_banks )
> +            *val = vmce->mci_ctl[bank] &
> +                   (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
>          mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n",
>                     bank, *val);
>          break;
> @@ -126,7 +146,7 @@ static int bank_mce_rdmsr(struct domain
>          switch ( boot_cpu_data.x86_vendor )
>          {
>          case X86_VENDOR_INTEL:
> -            ret = intel_mce_rdmsr(msr, val);
> +            ret = intel_mce_rdmsr(v, msr, val);
>              break;
>          default:
>              ret = 0;
> @@ -145,13 +165,13 @@ static int bank_mce_rdmsr(struct domain
>   */
>  int vmce_rdmsr(uint32_t msr, uint64_t *val)
>  {
> -    struct domain *d = current->domain;
> -    struct domain_mca_msrs *vmce = dom_vmce(d);
> +    const struct vcpu *cur = current;
> +    struct domain_mca_msrs *vmce = dom_vmce(cur->domain);
>      int ret = 1;
> 
>      *val = 0;
> 
> -    spin_lock(&dom_vmce(d)->lock);
> +    spin_lock(&vmce->lock);
> 
>      switch ( msr )
>      {
> @@ -162,39 +182,38 @@ int vmce_rdmsr(uint32_t msr, uint64_t *v
>                         "MCE: rdmsr MCG_STATUS 0x%"PRIx64"\n", *val);
>          break;
>      case MSR_IA32_MCG_CAP:
> -        *val = vmce->mcg_cap;
> +        *val = cur->arch.mcg_cap;
>          mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n",
>                     *val);
>          break;
>      case MSR_IA32_MCG_CTL:
>          /* Always 0 if no CTL support */
> -        *val = vmce->mcg_ctl & h_mcg_ctl;
> +        if ( cur->arch.mcg_cap & MCG_CTL_P )
> +            *val = vmce->mcg_ctl & h_mcg_ctl;
>          mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n",
>                     *val);
>          break;
>      default:
> -        ret = mce_bank_msr(msr) ? bank_mce_rdmsr(d, msr, val) : 0;
> +        ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0;
>          break;
>      }
> 
> -    spin_unlock(&dom_vmce(d)->lock);
> +    spin_unlock(&vmce->lock);
>      return ret;
>  }
> 
> -static int bank_mce_wrmsr(struct domain *d, u32 msr, u64 val)
> +static int bank_mce_wrmsr(struct vcpu *v, u32 msr, u64 val)
>  {
> -    int bank, ret = 1;
> -    struct domain_mca_msrs *vmce = dom_vmce(d);
> +    int ret = 1;
> +    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;
> +    struct domain_mca_msrs *vmce = dom_vmce(v->domain);
>      struct bank_entry *entry = NULL;
> 
> -    bank = (msr - MSR_IA32_MC0_CTL) / 4;
> -    if ( bank >= nr_mce_banks )
> -        return -EINVAL;
> -
>      switch ( msr & (MSR_IA32_MC0_CTL | 3) )
>      {
>      case MSR_IA32_MC0_CTL:
> -        vmce->mci_ctl[bank] = val;
> +        if ( bank < nr_mce_banks )
> +            vmce->mci_ctl[bank] = val;
>          break;
>      case MSR_IA32_MC0_STATUS:
>          /* Give the first entry of the list, it corresponds to current
> @@ -202,9 +221,9 @@ static int bank_mce_wrmsr(struct domain
>           * the guest, this node will be deleted.
>           * Only error bank is written. Non-error banks simply return.
>           */
> -        if ( !list_empty(&dom_vmce(d)->impact_header) )
> +        if ( !list_empty(&vmce->impact_header) )
>          {
> -            entry = list_entry(dom_vmce(d)->impact_header.next,
> +            entry = list_entry(vmce->impact_header.next,
>                                 struct bank_entry, list);
>              if ( entry->bank == bank )
>                  entry->mci_status = val;
> @@ -228,7 +247,7 @@ static int bank_mce_wrmsr(struct domain
>          switch ( boot_cpu_data.x86_vendor )
>          {
>          case X86_VENDOR_INTEL:
> -            ret = intel_mce_wrmsr(msr, val);
> +            ret = intel_mce_wrmsr(v, msr, val);
>              break;
>          default:
>              ret = 0;
> @@ -247,9 +266,9 @@ static int bank_mce_wrmsr(struct domain
>   */
>  int vmce_wrmsr(u32 msr, u64 val)
>  {
> -    struct domain *d = current->domain;
> +    struct vcpu *cur = current;
>      struct bank_entry *entry = NULL;
> -    struct domain_mca_msrs *vmce = dom_vmce(d);
> +    struct domain_mca_msrs *vmce = dom_vmce(cur->domain);
>      int ret = 1;
> 
>      if ( !g_mcg_cap )
> @@ -266,7 +285,7 @@ int vmce_wrmsr(u32 msr, u64 val)
>          vmce->mcg_status = val;
>          mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val);
>          /* For HVM guest, this is the point for deleting vMCE injection node */
> -        if ( d->is_hvm && (vmce->nr_injection > 0) )
> +        if ( is_hvm_vcpu(cur) && (vmce->nr_injection > 0) )
>          {
>              vmce->nr_injection--; /* Should be 0 */
>              if ( !list_empty(&vmce->impact_header) )
> @@ -293,7 +312,7 @@ int vmce_wrmsr(u32 msr, u64 val)
>          ret = -1;
>          break;
>      default:
> -        ret = mce_bank_msr(msr) ? bank_mce_wrmsr(d, msr, val) : 0;
> +        ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0;
>          break;
>      }
> 
> @@ -301,6 +320,46 @@ int vmce_wrmsr(u32 msr, u64 val)
>      return ret;
>  }
> 
> +static int vmce_save_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
> +{
> +    struct vcpu *v;
> +    int err = 0;
> +
> +    for_each_vcpu( d, v ) {
> +        struct hvm_vmce_vcpu ctxt = {
> +            .caps = v->arch.mcg_cap
> +        };
> +
> +        err = hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt);
> +        if ( err )
> +            break;
> +    }
> +
> +    return err;
> +}
> +
> +static int vmce_load_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
> +{
> +    unsigned int vcpuid = hvm_load_instance(h);
> +    struct vcpu *v;
> +    struct hvm_vmce_vcpu ctxt;
> +    int err;
> +
> +    if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
> +    {
> +        dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n",
> +                d->domain_id, vcpuid);
> +        err = -EINVAL;
> +    }
> +    else
> +        err = hvm_load_entry(VMCE_VCPU, h, &ctxt);
> +
> +    return err ?: vmce_restore_vcpu(v, ctxt.caps);
> +}
> +
> +HVM_REGISTER_SAVE_RESTORE(VMCE_VCPU, vmce_save_vcpu_ctxt,
> +                          vmce_load_vcpu_ctxt, 1, HVMSR_PER_VCPU);
> +
>  int inject_vmce(struct domain *d)
>  {
>      int cpu = smp_processor_id();
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -422,6 +422,8 @@ int vcpu_initialise(struct vcpu *v)
>      if ( (rc = vcpu_init_fpu(v)) != 0 )
>          return rc;
> 
> +    vmce_init_vcpu(v);
> +
>      if ( is_hvm_domain(d) )
>      {
>          rc = hvm_vcpu_initialise(v);
> --- a/xen/arch/x86/domctl.c
> +++ b/xen/arch/x86/domctl.c
> @@ -1027,11 +1027,12 @@ long arch_do_domctl(
>                  evc->syscall32_callback_eip    = 0;
>                  evc->syscall32_disables_events = 0;
>              }
> +            evc->mcg_cap = v->arch.mcg_cap;
>          }
>          else
>          {
>              ret = -EINVAL;
> -            if ( evc->size != sizeof(*evc) )
> +            if ( evc->size < offsetof(typeof(*evc), mcg_cap) )
>                  goto ext_vcpucontext_out;
>  #ifdef __x86_64__
>              if ( !is_hvm_domain(d) )
> @@ -1059,6 +1060,10 @@ long arch_do_domctl(
>                   (evc->syscall32_callback_cs & ~3) ||
>                   evc->syscall32_callback_eip )
>                  goto ext_vcpucontext_out;
> +
> +            if ( evc->size >= offsetof(typeof(*evc), mcg_cap) +
> +                              sizeof(evc->mcg_cap) )
> +                ret = vmce_restore_vcpu(v, evc->mcg_cap);
>          }
> 
>          ret = 0;
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -488,6 +488,8 @@ struct arch_vcpu
>      /* This variable determines whether nonlazy extended state has been used,
>       * and thus should be saved/restored. */
>      bool_t nonlazy_xstate_used;
> +
> +    uint64_t mcg_cap;
> 
>      struct paging_vcpu paging;
> 
> --- a/xen/include/asm-x86/mce.h
> +++ b/xen/include/asm-x86/mce.h
> @@ -16,7 +16,6 @@ struct bank_entry {
>  struct domain_mca_msrs
>  {
>      /* Guest should not change below values after DOM boot up */
> -    uint64_t mcg_cap;
>      uint64_t mcg_ctl;
>      uint64_t mcg_status;
>      uint64_t *mci_ctl;
> @@ -28,6 +27,8 @@ struct domain_mca_msrs
>  /* Guest vMCE MSRs virtualization */
>  extern int vmce_init_msr(struct domain *d);
>  extern void vmce_destroy_msr(struct domain *d);
> +extern void vmce_init_vcpu(struct vcpu *);
> +extern int vmce_restore_vcpu(struct vcpu *, uint64_t caps);
>  extern int vmce_wrmsr(uint32_t msr, uint64_t val);
>  extern int vmce_rdmsr(uint32_t msr, uint64_t *val);
> 
> --- a/xen/include/public/arch-x86/hvm/save.h
> +++ b/xen/include/public/arch-x86/hvm/save.h
> @@ -585,9 +585,15 @@ struct hvm_viridian_vcpu_context {
> 
>  DECLARE_HVM_SAVE_TYPE(VIRIDIAN_VCPU, 17, struct hvm_viridian_vcpu_context);
> 
> +struct hvm_vmce_vcpu {
> +    uint64_t caps;
> +};
> +
> +DECLARE_HVM_SAVE_TYPE(VMCE_VCPU, 18, struct hvm_vmce_vcpu);
> +
>  /*
>   * Largest type-code in use
>   */
> -#define HVM_SAVE_CODE_MAX 17
> +#define HVM_SAVE_CODE_MAX 18
> 
>  #endif /* __XEN_PUBLIC_HVM_SAVE_X86_H__ */
> --- a/xen/include/public/domctl.h
> +++ b/xen/include/public/domctl.h
> @@ -559,7 +559,7 @@ struct xen_domctl_ext_vcpucontext {
>      uint32_t         vcpu;
>      /*
>       * SET: Size of struct (IN)
> -     * GET: Size of struct (OUT)
> +     * GET: Size of struct (OUT, up to 128 bytes)
>       */
>      uint32_t         size;
>  #if defined(__i386__) || defined(__x86_64__)
> @@ -571,6 +571,7 @@ struct xen_domctl_ext_vcpucontext {
>      uint16_t         sysenter_callback_cs;
>      uint8_t          syscall32_disables_events;
>      uint8_t          sysenter_disables_events;
> +    uint64_aligned_t mcg_cap;
>  #endif
>  };
>  typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t;


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

