
Re: [Xen-devel] [PATCH] x86/xsave: fix migration from xsave-capable to xsave-incapable host



On 30/08/2013 10:55, "Jan Beulich" <JBeulich@xxxxxxxx> wrote:

> With CPUID features suitably masked this is supposed to work, but was
> completely broken (i.e. the case wasn't even considered when the
> original xsave save/restore code was written).
> 
> First of all, xsave_enabled() wrongly returned the value of
> cpu_has_xsave, i.e. not even taking into consideration attributes of
> the vCPU in question. Instead this function ought to check whether the
> guest ever enabled xsave support (by writing a [non-zero] value to
> XCR0). As a result of this, a vCPU's xcr0 and xcr0_accum must no longer
> be initialized to XSTATE_FP_SSE (since that's a valid value a guest
> could write to XCR0), and the xsave/xrstor as well as the context
> switch code need to suitably account for this (by always enforcing at
> least this part of the state to be saved/loaded).
> 
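For reference, the net effect of this first change, condensed from the hunks
below (annotations added here for clarity; they are not part of the patch
text):

    /* xsave_enabled() now reports guest use of xsave, not host capability. */
    bool_t xsave_enabled(const struct vcpu *v)
    {
        if ( !cpu_has_xsave )
            return 0;
        /* Becomes non-zero only once the guest writes a non-zero XCR0. */
        return !!v->arch.xcr0_accum;
    }

    /* The context switch in turn always loads at least the FP/SSE part,
     * since a guest's xcr0 may legitimately still be zero. */
    u64 xcr0 = n->arch.xcr0 ?: XSTATE_FP_SSE;
    if ( xcr0 != get_xcr0() && !set_xcr0(xcr0) )
        BUG();
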
> This involves undoing large parts of c/s 22945:13a7d1f7f62c ("x86: add
> strictly sanity check for XSAVE/XRSTOR") - we need to cleanly
> distinguish between hardware capabilities and vCPU used features.
> 
> Next both HVM and PV save code needed tweaking to save not the full
> state supported by the underlying hardware, but just the parts the
> guest actually used. Similarly the restore code should bail not only on
> state that the hardware cannot handle, but also on an inconsistent save
> image (inconsistent XCR0 settings, or a saved-state size not in line
> with XCR0).
> 
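The consistency checking described above is centralised in the new
validate_xstate() helper (full hunk further down); roughly, with annotations
added here for clarity:

    int validate_xstate(u64 xcr0, u64 xcr0_accum, u64 xstate_bv, u64 xfeat_mask)
    {
        if ( (xcr0_accum & ~xfeat_mask) ||   /* accum exceeds the mask saved in the image */
             (xstate_bv & ~xcr0_accum) ||    /* header bits beyond accumulated XCR0 */
             (xcr0 & ~xcr0_accum) ||         /* current XCR0 beyond accumulated XCR0 */
             !(xcr0 & XSTATE_FP) ||          /* the x87 bit must always be set */
             ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) ||
             ((xcr0_accum & XSTATE_YMM) && !(xcr0_accum & XSTATE_SSE)) )
            return -EINVAL;                  /* inconsistent save state */

        if ( xcr0_accum & ~xfeature_mask )
            return -EOPNOTSUPP;              /* restoring host lacks these features */

        return 0;
    }
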
> And finally the PV extended context get/set code needs to use slightly
> different logic than the HVM one, as here we can't just key off of
> xsave_enabled() (i.e. avoid doing anything if a guest doesn't use
> xsave) because the tools use this function to determine host
> capabilities as well as read/write vCPU state. The set operation in
> particular needs to be capable of cleanly dealing with input that
> consists of only the xcr0 and xcr0_accum values (if they're both zero
> then no further data is required).
> 
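Condensed from the domctl hunks below, the set operation now sizes and
accepts its input as follows (annotations added here for clarity):

    #define PV_XSAVE_SIZE(xcr0) (2 * sizeof(uint64_t) + xstate_ctxt_size(xcr0))

    if ( _xcr0_accum )
    {
        /* Real state: the buffer must at least hold the XSAVE header area. */
        if ( evc->size >= 2 * sizeof(uint64_t) + XSTATE_AREA_MIN_SIZE )
            ret = validate_xstate(_xcr0, _xcr0_accum,
                                  _xsave_area->xsave_hdr.xstate_bv,
                                  evc->xfeature_mask);
    }
    else if ( !_xcr0 )
        ret = 0;    /* xcr0 == xcr0_accum == 0: no further data required */
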
> While for things to work correctly both sides (saving _and_ restoring
> host) need to run with the fixed code, afaict no breakage should occur
> if either side isn't up to date (other than the breakage that this
> patch attempts to fix).
> 
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>

Acked-by: Keir Fraser <keir@xxxxxxx>

> ---
> Please note that the development and testing of this was done on 4.1.x,
> as that's where the issue was found. The xsave code being quite
> different between then and now, the porting was not really
> straightforward, but in the end the biggest difference was that the
> 4.1.x code needs more changes than presented here, due to its FPU
> context and XSAVE area not sharing the same memory block. Hence I
> think/hope I did all the porting over correctly, but I had no setup
> available where I could have fully tested all the scenarios myself.
> 
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -618,7 +618,7 @@ unsigned long pv_guest_cr4_fixup(const s
>          hv_cr4_mask &= ~X86_CR4_DE;
>      if ( cpu_has_fsgsbase && !is_pv_32bit_domain(v->domain) )
>          hv_cr4_mask &= ~X86_CR4_FSGSBASE;
> -    if ( xsave_enabled(v) )
> +    if ( cpu_has_xsave )
>          hv_cr4_mask &= ~X86_CR4_OSXSAVE;
>  
>      if ( (guest_cr4 & hv_cr4_mask) != (hv_cr4 & hv_cr4_mask) )
> @@ -1351,9 +1351,13 @@ static void __context_switch(void)
>      if ( !is_idle_vcpu(n) )
>      {
>          memcpy(stack_regs, &n->arch.user_regs, CTXT_SWITCH_STACK_BYTES);
> -        if ( xsave_enabled(n) && n->arch.xcr0 != get_xcr0() &&
> -             !set_xcr0(n->arch.xcr0) )
> -            BUG();
> +        if ( cpu_has_xsave )
> +        {
> +            u64 xcr0 = n->arch.xcr0 ?: XSTATE_FP_SSE;
> +
> +            if ( xcr0 != get_xcr0() && !set_xcr0(xcr0) )
> +                BUG();
> +        }
>          vcpu_restore_fpu_eager(n);
>          n->arch.ctxt_switch_to(n);
>      }
> --- a/xen/arch/x86/domctl.c
> +++ b/xen/arch/x86/domctl.c
> @@ -1047,11 +1047,8 @@ long arch_do_domctl(
>          struct xen_domctl_vcpuextstate *evc;
>          struct vcpu *v;
>          uint32_t offset = 0;
> -        uint64_t _xfeature_mask = 0;
> -        uint64_t _xcr0, _xcr0_accum;
> -        void *receive_buf = NULL, *_xsave_area;
>  
> -#define PV_XSAVE_SIZE (2 * sizeof(uint64_t) + xsave_cntxt_size)
> +#define PV_XSAVE_SIZE(xcr0) (2 * sizeof(uint64_t) + xstate_ctxt_size(xcr0))
>  
>          evc = &domctl->u.vcpuextstate;
>  
> @@ -1062,15 +1059,16 @@ long arch_do_domctl(
>  
>          if ( domctl->cmd == XEN_DOMCTL_getvcpuextstate )
>          {
> +            unsigned int size = PV_XSAVE_SIZE(v->arch.xcr0_accum);
> +
>              if ( !evc->size && !evc->xfeature_mask )
>              {
>                  evc->xfeature_mask = xfeature_mask;
> -                evc->size = PV_XSAVE_SIZE;
> +                evc->size = size;
>                  ret = 0;
>                  goto vcpuextstate_out;
>              }
> -            if ( evc->size != PV_XSAVE_SIZE ||
> -                 evc->xfeature_mask != xfeature_mask )
> +            if ( evc->size != size || evc->xfeature_mask != xfeature_mask )
>              {
>                  ret = -EINVAL;
>                  goto vcpuextstate_out;
> @@ -1093,7 +1091,7 @@ long arch_do_domctl(
>              offset += sizeof(v->arch.xcr0_accum);
>              if ( copy_to_guest_offset(domctl->u.vcpuextstate.buffer,
>                                        offset, (void *)v->arch.xsave_area,
> -                                      xsave_cntxt_size) )
> +                                      size - 2 * sizeof(uint64_t)) )
>              {
>                  ret = -EFAULT;
>                  goto vcpuextstate_out;
> @@ -1101,13 +1099,14 @@ long arch_do_domctl(
>          }
>          else
>          {
> -            ret = -EINVAL;
> +            void *receive_buf;
> +            uint64_t _xcr0, _xcr0_accum;
> +            const struct xsave_struct *_xsave_area;
>  
> -            _xfeature_mask = evc->xfeature_mask;
> -            /* xsave context must be restored on compatible target CPUs */
> -            if ( (_xfeature_mask & xfeature_mask) != _xfeature_mask )
> -                goto vcpuextstate_out;
> -            if ( evc->size > PV_XSAVE_SIZE || evc->size < 2 * sizeof(uint64_t) )
> +            ret = -EINVAL;
> +            if ( evc->size < 2 * sizeof(uint64_t) ||
> +                 evc->size > 2 * sizeof(uint64_t) +
> +                             xstate_ctxt_size(xfeature_mask) )
>                  goto vcpuextstate_out;
>  
>              receive_buf = xmalloc_bytes(evc->size);
> @@ -1128,20 +1127,30 @@ long arch_do_domctl(
>              _xcr0_accum = *(uint64_t *)(receive_buf + sizeof(uint64_t));
>              _xsave_area = receive_buf + 2 * sizeof(uint64_t);
>  
> -            if ( !(_xcr0 & XSTATE_FP) || _xcr0 & ~xfeature_mask )
> +            if ( _xcr0_accum )
>              {
> -                xfree(receive_buf);
> -                goto vcpuextstate_out;
> +                if ( evc->size >= 2 * sizeof(uint64_t) + XSTATE_AREA_MIN_SIZE )
> +                    ret = validate_xstate(_xcr0, _xcr0_accum,
> +                                          _xsave_area->xsave_hdr.xstate_bv,
> +                                          evc->xfeature_mask);
>              }
> -            if ( (_xcr0 & _xcr0_accum) != _xcr0 )
> +            else if ( !_xcr0 )
> +                ret = 0;
> +            if ( ret )
>              {
>                  xfree(receive_buf);
>                  goto vcpuextstate_out;
>              }
>  
> -            v->arch.xcr0 = _xcr0;
> -            v->arch.xcr0_accum = _xcr0_accum;
> -            memcpy(v->arch.xsave_area, _xsave_area, evc->size - 2 * sizeof(uint64_t) );
> +            if ( evc->size <= PV_XSAVE_SIZE(_xcr0_accum) )
> +            {
> +                v->arch.xcr0 = _xcr0;
> +                v->arch.xcr0_accum = _xcr0_accum;
> +                memcpy(v->arch.xsave_area, _xsave_area,
> +                       evc->size - 2 * sizeof(uint64_t));
> +            }
> +            else
> +                ret = -EINVAL;
>  
>              xfree(receive_buf);
>          }
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -906,14 +906,12 @@ static int hvm_load_cpu_ctxt(struct doma
>      hvm_set_segment_register(v, x86_seg_ldtr, &seg);
>  
>      /* In case xsave-absent save file is restored on a xsave-capable host */
> -    if ( xsave_enabled(v) )
> +    if ( cpu_has_xsave && !xsave_enabled(v) )
>      {
>          struct xsave_struct *xsave_area = v->arch.xsave_area;
>  
>          memcpy(v->arch.xsave_area, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
>          xsave_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE;
> -        v->arch.xcr0_accum = XSTATE_FP_SSE;
> -        v->arch.xcr0 = XSTATE_FP_SSE;
>      }
>      else
>          memcpy(v->arch.fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs));
> @@ -957,7 +955,9 @@ static int hvm_load_cpu_ctxt(struct doma
>  HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt,
>                            1, HVMSR_PER_VCPU);
>  
> -#define HVM_CPU_XSAVE_SIZE  (3 * sizeof(uint64_t) + xsave_cntxt_size)
> +#define HVM_CPU_XSAVE_SIZE(xcr0) (offsetof(struct hvm_hw_cpu_xsave, \
> +                                           save_area) + \
> +                                  xstate_ctxt_size(xcr0))
>  
>  static int hvm_save_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
>  {
> @@ -969,20 +969,20 @@ static int hvm_save_cpu_xsave_states(str
>  
>      for_each_vcpu ( d, v )
>      {
> +        unsigned int size = HVM_CPU_XSAVE_SIZE(v->arch.xcr0_accum);
> +
>          if ( !xsave_enabled(v) )
>              continue;
> -        if ( _hvm_init_entry(h, CPU_XSAVE_CODE, v->vcpu_id, HVM_CPU_XSAVE_SIZE) )
> +        if ( _hvm_init_entry(h, CPU_XSAVE_CODE, v->vcpu_id, size) )
>              return 1;
>          ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur];
> -        h->cur += HVM_CPU_XSAVE_SIZE;
> -        memset(ctxt, 0, HVM_CPU_XSAVE_SIZE);
> +        h->cur += size;
>  
>          ctxt->xfeature_mask = xfeature_mask;
>          ctxt->xcr0 = v->arch.xcr0;
>          ctxt->xcr0_accum = v->arch.xcr0_accum;
> -        if ( v->fpu_initialised )
> -            memcpy(&ctxt->save_area,
> -                v->arch.xsave_area, xsave_cntxt_size);
> +        memcpy(&ctxt->save_area, v->arch.xsave_area,
> +               size - offsetof(struct hvm_hw_cpu_xsave, save_area));
>      }
>  
>      return 0;
> @@ -990,11 +990,11 @@ static int hvm_save_cpu_xsave_states(str
>  
>  static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h)
>  {
> -    int vcpuid;
> +    unsigned int vcpuid, size;
> +    int err;
>      struct vcpu *v;
>      struct hvm_hw_cpu_xsave *ctxt;
>      struct hvm_save_descriptor *desc;
> -    uint64_t _xfeature_mask;
>  
>      /* Which vcpu is this? */
>      vcpuid = hvm_load_instance(h);
> @@ -1006,47 +1006,74 @@ static int hvm_load_cpu_xsave_states(str
>      }
>  
>      /* Fails since we can't restore an img saved on xsave-capable host. */
> -    if ( !xsave_enabled(v) )
> -        return -EINVAL;
> +    if ( !cpu_has_xsave )
> +        return -EOPNOTSUPP;
>  
>      /* Customized checking for entry since our entry is of variable length */
>      desc = (struct hvm_save_descriptor *)&h->data[h->cur];
>      if ( sizeof (*desc) > h->size - h->cur)
>      {
>          printk(XENLOG_G_WARNING
> -               "HVM%d restore: not enough data left to read descriptor"
> -               "for type %u\n", d->domain_id, CPU_XSAVE_CODE);
> -        return -1;
> +               "HVM%d.%d restore: not enough data left to read xsave
> descriptor\n",
> +               d->domain_id, vcpuid);
> +        return -ENODATA;
>      }
>      if ( desc->length + sizeof (*desc) > h->size - h->cur)
>      {
>          printk(XENLOG_G_WARNING
> -               "HVM%d restore: not enough data left to read %u bytes "
> -               "for type %u\n", d->domain_id, desc->length, CPU_XSAVE_CODE);
> -        return -1;
> +               "HVM%d.%d restore: not enough data left to read %u xsave
> bytes\n",
> +               d->domain_id, vcpuid, desc->length);
> +        return -ENODATA;
> +    }
> +    if ( desc->length < offsetof(struct hvm_hw_cpu_xsave, save_area) +
> +                        XSTATE_AREA_MIN_SIZE )
> +    {
> +        printk(XENLOG_G_WARNING
> +               "HVM%d.%d restore mismatch: xsave length %u < %zu\n",
> +               d->domain_id, vcpuid, desc->length,
> +               offsetof(struct hvm_hw_cpu_xsave,
> +                        save_area) + XSTATE_AREA_MIN_SIZE);
> +        return -EINVAL;
>      }
> -    if ( CPU_XSAVE_CODE != desc->typecode || (desc->length > HVM_CPU_XSAVE_SIZE) )
> +    size = HVM_CPU_XSAVE_SIZE(xfeature_mask);
> +    if ( desc->length > size )
>      {
>          printk(XENLOG_G_WARNING
> -               "HVM%d restore mismatch: expected type %u with max length %u,
> "
> -               "saw type %u length %u\n", d->domain_id, CPU_XSAVE_CODE,
> -               (unsigned int)HVM_CPU_XSAVE_SIZE,
> -               desc->typecode, desc->length);
> -        return -1;
> +               "HVM%d.%d restore mismatch: xsave length %u > %u\n",
> +               d->domain_id, vcpuid, desc->length, size);
> +        return -EOPNOTSUPP;
>      }
>      h->cur += sizeof (*desc);
> -    /* Checking finished */
>  
>      ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur];
>      h->cur += desc->length;
>  
> -    _xfeature_mask = ctxt->xfeature_mask;
> -    if ( (_xfeature_mask & xfeature_mask) != _xfeature_mask )
> -        return -EINVAL;
> +    err = validate_xstate(ctxt->xcr0, ctxt->xcr0_accum,
> +                          ctxt->save_area.xsave_hdr.xstate_bv,
> +                          ctxt->xfeature_mask);
> +    if ( err )
> +    {
> +        printk(XENLOG_G_WARNING
> +               "HVM%d.%d restore: inconsistent xsave state (feat=%#"PRIx64
> +               " accum=%#"PRIx64" xcr0=%#"PRIx64" bv=%#"PRIx64" err=%d)\n",
> +               d->domain_id, vcpuid, ctxt->xfeature_mask, ctxt->xcr0_accum,
> +               ctxt->xcr0, ctxt->save_area.xsave_hdr.xstate_bv, err);
> +        return err;
> +    }
> +    size = HVM_CPU_XSAVE_SIZE(ctxt->xcr0_accum);
> +    if ( desc->length > size )
> +    {
> +        printk(XENLOG_G_WARNING
> +               "HVM%d.%d restore mismatch: xsave length %u > %u\n",
> +               d->domain_id, vcpuid, desc->length, size);
> +        return -EOPNOTSUPP;
> +    }
> +    /* Checking finished */
>  
>      v->arch.xcr0 = ctxt->xcr0;
>      v->arch.xcr0_accum = ctxt->xcr0_accum;
> -    memcpy(v->arch.xsave_area, &ctxt->save_area, xsave_cntxt_size);
> +    memcpy(v->arch.xsave_area, &ctxt->save_area,
> +           desc->length - offsetof(struct hvm_hw_cpu_xsave, save_area));
>  
>      return 0;
>  }
> @@ -1060,7 +1087,8 @@ static int __init __hvm_register_CPU_XSA
>                          "CPU_XSAVE",
>                          hvm_save_cpu_xsave_states,
>                          hvm_load_cpu_xsave_states,
> -                        HVM_CPU_XSAVE_SIZE + sizeof (struct hvm_save_descriptor),
> +                        HVM_CPU_XSAVE_SIZE(xfeature_mask) +
> +                            sizeof(struct hvm_save_descriptor),
>                          HVMSR_PER_VCPU);
>      return 0;
>  }
> @@ -2767,7 +2795,7 @@ void hvm_cpuid(unsigned int input, unsig
>              __clear_bit(X86_FEATURE_APIC & 31, edx);
>  
>          /* Fix up OSXSAVE. */
> -        if ( xsave_enabled(v) )
> +        if ( cpu_has_xsave )
>              *ecx |= (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE) ?
>                       cpufeat_mask(X86_FEATURE_OSXSAVE) : 0;
>  
> --- a/xen/arch/x86/hvm/vmx/vmcs.c
> +++ b/xen/arch/x86/hvm/vmx/vmcs.c
> @@ -944,8 +944,7 @@ static int construct_vmcs(struct vcpu *v
>      /* Host control registers. */
>      v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS;
>      __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
> -    __vmwrite(HOST_CR4,
> -              mmu_cr4_features | (xsave_enabled(v) ? X86_CR4_OSXSAVE : 0));
> +    __vmwrite(HOST_CR4, mmu_cr4_features);
>  
>      /* Host CS:RIP. */
>      __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS);
> --- a/xen/arch/x86/i387.c
> +++ b/xen/arch/x86/i387.c
> @@ -38,14 +38,15 @@ static inline void fpu_xrstor(struct vcp
>  {
>      bool_t ok;
>  
> +    ASSERT(v->arch.xsave_area);
>      /*
>       * XCR0 normally represents what guest OS set. In case of Xen itself,
> -     * we set all supported feature mask before doing save/restore.
> +     * we set the accumulated feature mask before doing save/restore.
>       */
> -    ok = set_xcr0(v->arch.xcr0_accum);
> +    ok = set_xcr0(v->arch.xcr0_accum | XSTATE_FP_SSE);
>      ASSERT(ok);
>      xrstor(v, mask);
> -    ok = set_xcr0(v->arch.xcr0);
> +    ok = set_xcr0(v->arch.xcr0 ?: XSTATE_FP_SSE);
>      ASSERT(ok);
>  }
>  
> @@ -137,13 +138,15 @@ static inline void fpu_xsave(struct vcpu
>  {
>      bool_t ok;
>  
> -    /* XCR0 normally represents what guest OS set. In case of Xen itself,
> -     * we set all accumulated feature mask before doing save/restore.
> +    ASSERT(v->arch.xsave_area);
> +    /*
> +     * XCR0 normally represents what guest OS set. In case of Xen itself,
> +     * we set the accumulated feature mask before doing save/restore.
>       */
> -    ok = set_xcr0(v->arch.xcr0_accum);
> +    ok = set_xcr0(v->arch.xcr0_accum | XSTATE_FP_SSE);
>      ASSERT(ok);
>      xsave(v, v->arch.nonlazy_xstate_used ? XSTATE_ALL : XSTATE_LAZY);
> -    ok = set_xcr0(v->arch.xcr0);
> +    ok = set_xcr0(v->arch.xcr0 ?: XSTATE_FP_SSE);
>      ASSERT(ok);
>  }
>  
> @@ -232,7 +235,7 @@ void vcpu_restore_fpu_lazy(struct vcpu *
>      if ( v->fpu_dirtied )
>          return;
>  
> -    if ( xsave_enabled(v) )
> +    if ( cpu_has_xsave )
>          fpu_xrstor(v, XSTATE_LAZY);
>      else if ( v->fpu_initialised )
>      {
> @@ -262,7 +265,7 @@ void vcpu_save_fpu(struct vcpu *v)
>      /* This can happen, if a paravirtualised guest OS has set its CR0.TS. */
>      clts();
>  
> -    if ( xsave_enabled(v) )
> +    if ( cpu_has_xsave )
>          fpu_xsave(v);
>      else if ( cpu_has_fxsr )
>          fpu_fxsave(v);
> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -817,7 +817,7 @@ static void pv_cpuid(struct cpu_user_reg
>          __clear_bit(X86_FEATURE_PDCM % 32, &c);
>          __clear_bit(X86_FEATURE_PCID % 32, &c);
>          __clear_bit(X86_FEATURE_DCA % 32, &c);
> -        if ( !xsave_enabled(current) )
> +        if ( !cpu_has_xsave )
>          {
>              __clear_bit(X86_FEATURE_XSAVE % 32, &c);
>              __clear_bit(X86_FEATURE_AVX % 32, &c);
> @@ -842,7 +842,7 @@ static void pv_cpuid(struct cpu_user_reg
>          break;
>  
>      case 0x0000000d: /* XSAVE */
> -        if ( !xsave_enabled(current) )
> +        if ( !cpu_has_xsave )
>              goto unsupported;
>          break;
>  
> --- a/xen/arch/x86/xstate.c
> +++ b/xen/arch/x86/xstate.c
> @@ -21,7 +21,7 @@ bool_t __read_mostly cpu_has_xsaveopt;
>   * the supported and enabled features on the processor, including the
>   * XSAVE.HEADER. We only enable XCNTXT_MASK that we have known.
>   */
> -u32 xsave_cntxt_size;
> +static u32 __read_mostly xsave_cntxt_size;
>  
>  /* A 64-bit bitmask of the XSAVE/XRSTOR features supported by processor. */
>  u64 xfeature_mask;
> @@ -206,13 +206,13 @@ void xrstor(struct vcpu *v, uint64_t mas
>  
>  bool_t xsave_enabled(const struct vcpu *v)
>  {
> -    if ( cpu_has_xsave )
> -    {
> -        ASSERT(xsave_cntxt_size >= XSTATE_AREA_MIN_SIZE);
> -        ASSERT(v->arch.xsave_area);
> -    }
> +    if ( !cpu_has_xsave )
> +        return 0;
>  
> -    return cpu_has_xsave; 
> +    ASSERT(xsave_cntxt_size >= XSTATE_AREA_MIN_SIZE);
> +    ASSERT(v->arch.xsave_area);
> +
> +    return !!v->arch.xcr0_accum;
>  }
>  
>  int xstate_alloc_save_area(struct vcpu *v)
> @@ -238,8 +238,8 @@ int xstate_alloc_save_area(struct vcpu *
>      save_area->fpu_sse.mxcsr = MXCSR_DEFAULT;
>  
>      v->arch.xsave_area = save_area;
> -    v->arch.xcr0 = XSTATE_FP_SSE;
> -    v->arch.xcr0_accum = XSTATE_FP_SSE;
> +    v->arch.xcr0 = 0;
> +    v->arch.xcr0_accum = 0;
>  
>      return 0;
>  }
> @@ -257,7 +257,11 @@ void xstate_init(bool_t bsp)
>      u64 feature_mask;
>  
>      if ( boot_cpu_data.cpuid_level < XSTATE_CPUID )
> +    {
> +        BUG_ON(!bsp);
> +        setup_clear_cpu_cap(X86_FEATURE_XSAVE);
>          return;
> +    }
>  
>      cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
>  
> @@ -277,7 +281,6 @@ void xstate_init(bool_t bsp)
>      set_in_cr4(X86_CR4_OSXSAVE);
>      if ( !set_xcr0(feature_mask) )
>          BUG();
> -    cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
>  
>      if ( bsp )
>      {
> @@ -286,14 +289,14 @@ void xstate_init(bool_t bsp)
>           * xsave_cntxt_size is the max size required by enabled features.
>           * We know FP/SSE and YMM about eax, and nothing about edx at present.
>           */
> -        xsave_cntxt_size = ebx;
> +        xsave_cntxt_size = xstate_ctxt_size(feature_mask);
>          printk("%s: using cntxt_size: %#x and states: %#"PRIx64"\n",
>              __func__, xsave_cntxt_size, xfeature_mask);
>      }
>      else
>      {
>          BUG_ON(xfeature_mask != feature_mask);
> -        BUG_ON(xsave_cntxt_size != ebx);
> +        BUG_ON(xsave_cntxt_size != xstate_ctxt_size(feature_mask));
>      }
>  
>      /* Check XSAVEOPT feature. */
> @@ -304,6 +307,42 @@ void xstate_init(bool_t bsp)
>          BUG_ON(!cpu_has_xsaveopt != !(eax & XSTATE_FEATURE_XSAVEOPT));
>  }
>  
> +unsigned int xstate_ctxt_size(u64 xcr0)
> +{
> +    u32 ebx = 0;
> +
> +    if ( xcr0 )
> +    {
> +        u64 act_xcr0 = get_xcr0();
> +        u32 eax, ecx, edx;
> +        bool_t ok = set_xcr0(xcr0);
> +
> +        ASSERT(ok);
> +        cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
> +        ASSERT(ebx <= ecx);
> +        ok = set_xcr0(act_xcr0);
> +        ASSERT(ok);
> +    }
> +
> +    return ebx;
> +}
> +
> +int validate_xstate(u64 xcr0, u64 xcr0_accum, u64 xstate_bv, u64 xfeat_mask)
> +{
> +    if ( (xcr0_accum & ~xfeat_mask) ||
> +         (xstate_bv & ~xcr0_accum) ||
> +         (xcr0 & ~xcr0_accum) ||
> +         !(xcr0 & XSTATE_FP) ||
> +         ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) ||
> +         ((xcr0_accum & XSTATE_YMM) && !(xcr0_accum & XSTATE_SSE)) )
> +        return -EINVAL;
> +
> +    if ( xcr0_accum & ~xfeature_mask )
> +        return -EOPNOTSUPP;
> +
> +    return 0;
> +}
> +
>  int handle_xsetbv(u32 index, u64 new_bv)
>  {
>      struct vcpu *curr = current;
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -456,9 +456,9 @@ unsigned long pv_guest_cr4_fixup(const s
>  #define pv_guest_cr4_to_real_cr4(v)                         \
>      (((v)->arch.pv_vcpu.ctrlreg[4]                          \
>        | (mmu_cr4_features                                   \
> -         & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP))      \
> -      | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0)          \
> -      | ((xsave_enabled(v))? X86_CR4_OSXSAVE : 0))          \
> +         & (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_SMEP |      \
> +            X86_CR4_OSXSAVE))                               \
> +      | ((v)->domain->arch.vtsc ? X86_CR4_TSD : 0))         \
>       & ~X86_CR4_DE)
>  #define real_cr4_to_pv_guest_cr4(c)                         \
>      ((c) & ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_TSD        \
> --- a/xen/include/asm-x86/hvm/hvm.h
> +++ b/xen/include/asm-x86/hvm/hvm.h
> @@ -368,7 +368,7 @@ static inline int hvm_event_pending(stru
>          ((nestedhvm_enabled((_v)->domain) && cpu_has_vmx)\
>                        ? X86_CR4_VMXE : 0)  |             \
>          (cpu_has_pcid ? X86_CR4_PCIDE : 0) |             \
> -        (xsave_enabled(_v) ? X86_CR4_OSXSAVE : 0))))
> +        (cpu_has_xsave ? X86_CR4_OSXSAVE : 0))))
>  
>  /* These exceptions must always be intercepted. */
>  #define HVM_TRAP_MASK ((1U << TRAP_machine_check) | (1U << TRAP_invalid_op))
> --- a/xen/include/asm-x86/xstate.h
> +++ b/xen/include/asm-x86/xstate.h
> @@ -34,7 +34,6 @@
>  #define XSTATE_NONLAZY (XSTATE_LWP)
>  #define XSTATE_LAZY    (XSTATE_ALL & ~XSTATE_NONLAZY)
>  
> -extern unsigned int xsave_cntxt_size;
>  extern u64 xfeature_mask;
>  
>  /* extended state save area */
> @@ -77,11 +76,14 @@ uint64_t get_xcr0(void);
>  void xsave(struct vcpu *v, uint64_t mask);
>  void xrstor(struct vcpu *v, uint64_t mask);
>  bool_t xsave_enabled(const struct vcpu *v);
> +int __must_check validate_xstate(u64 xcr0, u64 xcr0_accum, u64 xstate_bv,
> +                                 u64 xfeat_mask);
>  int __must_check handle_xsetbv(u32 index, u64 new_bv);
>  
>  /* extended state init and cleanup functions */
>  void xstate_free_save_area(struct vcpu *v);
>  int xstate_alloc_save_area(struct vcpu *v);
>  void xstate_init(bool_t bsp);
> +unsigned int xstate_ctxt_size(u64 xcr0);
>  
>  #endif /* __ASM_XSTATE_H */
> 
> 



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

