
Re: [Xen-devel] [PATCH v3 1/5] EPT: force re-evaluation of memory type as necessary



> From: Jan Beulich
> Sent: Monday, April 07, 2014 6:08 PM
> 
> The main goal here is to drop the bogus dependency of
> epte_get_entry_emt() on
> d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT].
> 
> Any change to state influencing epte_get_entry_emt()'s decision needs
> to result in re-calculation. Do this by using the EPT_MISCONFIG VM
> exit, storing an invalid memory type into EPT's emt field (leaving the
> IOMMU, which doesn't care about memory types, unaffected).
> 
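
Side note to restate my understanding: only EPT memory types 0/1/4/5/6
are architecturally valid; 2, 3 and 7 are reserved and fault with
EPT_MISCONFIG on access (assuming I read the SDM correctly). Writing 7,
which happens to equal MTRR_NUM_TYPES, into the emt field therefore
guarantees an exit on the next access. An illustrative sketch of the
marking, reusing the patch's own helpers:

    /* Invalidate one entry: the next guest access takes an
     * EPT_MISCONFIG exit, where the real type gets re-derived. */
    ept_entry_t e = atomic_read_ept_entry(&epte[i]);
    e.emt = MTRR_NUM_TYPES;            /* 7 - reserved, i.e. "invalid" */
    atomic_write_ept_entry(&epte[i], e);
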
> This is being done in a hierarchical manner to keep execution time
> down: Initially only the top level directory gets invalidated this way.
> Upon access, the involved intermediate page table levels get cleared
> back to zero, and the leaf entry gets its field properly set. For 4k
> leaves all other entries in the same directory also get processed to
> amortize the cost of the extra VM exit (which halved the number of
> these VM exits in my testing).
> 
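
To convince myself the lazy hierarchical scheme stays cheap, I modelled
it in plain userspace C - this is NOT Xen code, just a toy with fan-out
4 standing in for 512 and a single directory level:

    #include <stdio.h>

    #define FANOUT  4
    #define INVALID 7                  /* stand-in for MTRR_NUM_TYPES */

    static int dir[FANOUT];            /* directory-level markers */
    static int leaf[FANOUT][FANOUT];   /* leaf "emt" values */

    static int derive_type(int d, int l) { return (d + l) % 2 ? 5 : 6; }

    static void types_changed(void)    /* cheap: mark top level only */
    {
        for ( int d = 0; d < FANOUT; d++ )
            dir[d] = INVALID;
    }

    static int access_leaf(int d, int l)   /* the "misconfig handler" */
    {
        if ( dir[d] == INVALID )
        {
            for ( int i = 0; i < FANOUT; i++ )  /* amortize: fix all */
                leaf[d][i] = derive_type(d, i); /* siblings at once  */
            dir[d] = 0;                         /* directory is clean */
        }
        return leaf[d][l];             /* later accesses: no more exits */
    }

    int main(void)
    {
        types_changed();               /* "memory_type_changed()" */
        printf("type at (2,3) = %d\n", access_leaf(2, 3));
        return 0;
    }

Only the first access per directory pays for the fix-up; everything
else in that directory is then handled without further exits, which is
consistent with the halving of exit counts you observed.
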
> This restoring can result in spurious EPT_MISCONFIG VM exits (since
> two vCPU-s may access addresses involving identical page table
> structures). Rather than simply returning in such cases (and risking
> that such a VM exit results from a real mis-configuration, which
> would then result in an endless loop rather than killing the VM), a
> per-vCPU flag is being introduced indicating when such a spurious VM
> exit might validly happen - if another one occurs right after VM re-
> entry, the flag would generally end up being clear, causing the VM
> to be killed as before on such VM exits.
> 
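
The flag protocol, condensed here purely for illustration (the real
logic is in ept_handle_misconfig() below; table walk and locking
elided):

    okay = -curr->arch.hvm_vmx.ept_spurious_misconfig; /* tolerate one */
    /* ... walk and fix up the tables, setting okay = 1 on real work ... */
    if ( okay > 0 )
        for_each_vcpu ( curr->domain, v )    /* others may now take a */
            v->arch.hvm_vmx.ept_spurious_misconfig = 1; /* benign exit */
    curr->arch.hvm_vmx.ept_spurious_misconfig = 0;      /* one-shot */
    return !!okay;  /* 0 - genuine misconfiguration, caller crashes VM */
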
> Note that putting a reserved memory type value in the EPT structures
> isn't formally sanctioned by the specification. Intel isn't willing to
> adjust the specification to make this or a similar use of the
> EPT_MISCONFIG VM exit formally possible, but they have indicated that
> us using this is low risk wrt forward compatibility.
> 
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> Reviewed-by: Tim Deegan <tim@xxxxxxx>
> ---
> v3: Code movement.

Any data on the improvement over the original code?

One small comment: would it be clearer to define a new macro to indicate
the purpose, instead of using MTRR_NUM_TYPES directly?
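
E.g. something like (name purely illustrative):

    /* emt value no valid memory type maps to; forces an EPT_MISCONFIG
     * exit on next access so the type gets re-derived lazily. */
    #define EPT_EMT_INVALID MTRR_NUM_TYPES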

Acked-by: Kevin Tian <kevin.tian@xxxxxxxxx>

Thanks
Kevin

> 
> --- a/xen/arch/x86/domctl.c
> +++ b/xen/arch/x86/domctl.c
> @@ -84,6 +84,8 @@ long arch_do_domctl(
>              ret = ioports_permit_access(d, fp, fp + np - 1);
>          else
>              ret = ioports_deny_access(d, fp, fp + np - 1);
> +        if ( !ret )
> +            memory_type_changed(d);
>      }
>      break;
> 
> @@ -705,6 +707,8 @@ long arch_do_domctl(
>                         ret, add ? "removing" : "denying", d->domain_id,
>                         mfn, mfn + nr_mfns - 1);
>          }
> +        /* Do this unconditionally to cover errors on above failure paths. */
> +        memory_type_changed(d);
>      }
>      break;
> 
> @@ -791,6 +795,8 @@ long arch_do_domctl(
>                         "ioport_map: error %ld denying dom%d access to [%x,%x]\n",
>                         ret, d->domain_id, fmp, fmp + np - 1);
>          }
> +        if ( !ret )
> +            memory_type_changed(d);
>      }
>      break;
> 
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -252,6 +252,9 @@ int hvm_set_guest_pat(struct vcpu *v, u6
> 
>      if ( !hvm_funcs.set_guest_pat(v, guest_pat) )
>          v->arch.hvm_vcpu.pat_cr = guest_pat;
> +
> +    memory_type_changed(v->domain);
> +
>      return 1;
>  }
> 
> --- a/xen/arch/x86/hvm/mtrr.c
> +++ b/xen/arch/x86/hvm/mtrr.c
> @@ -431,8 +431,12 @@ bool_t mtrr_def_type_msr_set(struct doma
>           return 0;
>      }
> 
> -    m->enabled = enabled;
> -    m->def_type = def_type;
> +    if ( m->enabled != enabled || m->def_type != def_type )
> +    {
> +        m->enabled = enabled;
> +        m->def_type = def_type;
> +        memory_type_changed(d);
> +    }
> 
>      return 1;
>  }
> @@ -452,6 +456,7 @@ bool_t mtrr_fix_range_msr_set(struct dom
>                  return 0;
> 
>          fixed_range_base[row] = msr_content;
> +        memory_type_changed(d);
>      }
> 
>      return 1;
> @@ -496,6 +501,8 @@ bool_t mtrr_var_range_msr_set(
> 
>      m->overlapped = is_var_mtrr_overlapped(m);
> 
> +    memory_type_changed(d);
> +
>      return 1;
>  }
> 
> @@ -690,6 +697,12 @@ static int hvm_load_mtrr_msr(struct doma
>  HVM_REGISTER_SAVE_RESTORE(MTRR, hvm_save_mtrr_msr, hvm_load_mtrr_msr,
>                            1, HVMSR_PER_VCPU);
> 
> +void memory_type_changed(struct domain *d)
> +{
> +    if ( iommu_enabled && !iommu_snoop && d->vcpu && d->vcpu[0] )
> +        p2m_memory_type_changed(d);
> +}
> +
>  uint8_t epte_get_entry_emt(struct domain *d, unsigned long gfn, mfn_t mfn,
>                             uint8_t *ipat, bool_t direct_mmio)
>  {
> @@ -733,8 +746,7 @@ uint8_t epte_get_entry_emt(struct domain
>          return MTRR_TYPE_WRBACK;
>      }
> 
> -    gmtrr_mtype = is_hvm_domain(d) && v &&
> -                  d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] ?
> +    gmtrr_mtype = is_hvm_domain(d) && v ?
>                    get_mtrr_type(&v->arch.hvm_vcpu.mtrr, (gfn << PAGE_SHIFT)) :
>                    MTRR_TYPE_WRBACK;
> 
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -3016,6 +3016,16 @@ void vmx_vmexit_handler(struct cpu_user_
>          break;
>      }
> 
> +    case EXIT_REASON_EPT_MISCONFIG:
> +    {
> +        paddr_t gpa;
> +
> +        __vmread(GUEST_PHYSICAL_ADDRESS, &gpa);
> +        if ( !ept_handle_misconfig(gpa) )
> +            goto exit_and_crash;
> +        break;
> +    }
> +
>      case EXIT_REASON_MONITOR_TRAP_FLAG:
>          v->arch.hvm_vmx.exec_control &=
> ~CPU_BASED_MONITOR_TRAP_FLAG;
>          vmx_update_cpu_exec_control(v);
> --- a/xen/arch/x86/mm/p2m.c
> +++ b/xen/arch/x86/mm/p2m.c
> @@ -200,6 +200,18 @@ void p2m_change_entry_type_global(struct
>      p2m_unlock(p2m);
>  }
> 
> +void p2m_memory_type_changed(struct domain *d)
> +{
> +    struct p2m_domain *p2m = p2m_get_hostp2m(d);
> +
> +    if ( p2m->memory_type_changed )
> +    {
> +        p2m_lock(p2m);
> +        p2m->memory_type_changed(p2m);
> +        p2m_unlock(p2m);
> +    }
> +}
> +
>  mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
>                      p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
>                      unsigned int *page_order, bool_t locked)
> --- a/xen/arch/x86/mm/p2m-ept.c
> +++ b/xen/arch/x86/mm/p2m-ept.c
> @@ -270,6 +270,124 @@ static int ept_next_level(struct p2m_dom
>      return GUEST_TABLE_NORMAL_PAGE;
>  }
> 
> +static bool_t ept_invalidate_emt(mfn_t mfn)
> +{
> +    ept_entry_t *epte = map_domain_page(mfn_x(mfn));
> +    unsigned int i;
> +    bool_t changed = 0;
> +
> +    for ( i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
> +    {
> +        ept_entry_t e = atomic_read_ept_entry(&epte[i]);
> +
> +        if ( !is_epte_valid(&e) || !is_epte_present(&e) ||
> +             e.emt == MTRR_NUM_TYPES )
> +            continue;
> +
> +        e.emt = MTRR_NUM_TYPES;
> +        atomic_write_ept_entry(&epte[i], e);
> +        changed = 1;
> +    }
> +
> +    unmap_domain_page(epte);
> +
> +    return changed;
> +}
> +
> +bool_t ept_handle_misconfig(uint64_t gpa)
> +{
> +    struct vcpu *curr = current;
> +    struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
> +    struct ept_data *ept = &p2m->ept;
> +    unsigned int level = ept_get_wl(ept);
> +    unsigned long gfn = PFN_DOWN(gpa);
> +    unsigned long mfn = ept_get_asr(ept);
> +    ept_entry_t *epte;
> +    int okay;
> +
> +    if ( !mfn )
> +        return 0;
> +
> +    p2m_lock(p2m);
> +
> +    okay = -curr->arch.hvm_vmx.ept_spurious_misconfig;
> +    for ( ; ; --level )
> +    {
> +        ept_entry_t e;
> +        unsigned int i;
> +
> +        epte = map_domain_page(mfn);
> +        i = (gfn >> (level * EPT_TABLE_ORDER)) & (EPT_PAGETABLE_ENTRIES - 1);
> +        e = atomic_read_ept_entry(&epte[i]);
> +
> +        if ( level == 0 || is_epte_superpage(&e) )
> +        {
> +            uint8_t ipat = 0;
> +
> +            if ( e.emt != MTRR_NUM_TYPES )
> +                break;
> +
> +            if ( level == 0 )
> +            {
> +                for ( gfn -= i, i = 0; i < EPT_PAGETABLE_ENTRIES; ++i )
> +                {
> +                    e = atomic_read_ept_entry(&epte[i]);
> +                    if ( e.emt == MTRR_NUM_TYPES )
> +                        e.emt = 0;
> +                    if ( !is_epte_valid(&e) || !is_epte_present(&e) )
> +                        continue;
> +                    e.emt = epte_get_entry_emt(p2m->domain, gfn + i,
> +                                               _mfn(e.mfn), &ipat,
> +                                               e.sa_p2mt == p2m_mmio_direct);
> +                    e.ipat = ipat;
> +                    atomic_write_ept_entry(&epte[i], e);
> +                }
> +            }
> +            else
> +            {
> +                e.emt = epte_get_entry_emt(p2m->domain, gfn, _mfn(e.mfn),
> +                                           &ipat,
> +                                           e.sa_p2mt == p2m_mmio_direct);
> +                e.ipat = ipat;
> +                atomic_write_ept_entry(&epte[i], e);
> +            }
> +
> +            okay = 1;
> +            break;
> +        }
> +
> +        if ( e.emt == MTRR_NUM_TYPES )
> +        {
> +            ASSERT(is_epte_present(&e));
> +            ept_invalidate_emt(_mfn(e.mfn));
> +            e.emt = 0;
> +            atomic_write_ept_entry(&epte[i], e);
> +            unmap_domain_page(epte);
> +            okay = 1;
> +        }
> +        else if ( is_epte_present(&e) && !e.emt )
> +            unmap_domain_page(epte);
> +        else
> +            break;
> +
> +        mfn = e.mfn;
> +    }
> +
> +    unmap_domain_page(epte);
> +    if ( okay > 0 )
> +    {
> +        struct vcpu *v;
> +
> +        for_each_vcpu ( curr->domain, v )
> +            v->arch.hvm_vmx.ept_spurious_misconfig = 1;
> +    }
> +    curr->arch.hvm_vmx.ept_spurious_misconfig = 0;
> +    ept_sync_domain(p2m);
> +    p2m_unlock(p2m);
> +
> +    return !!okay;
> +}
> +
>  /*
>   * ept_set_entry() computes 'need_modify_vtd_table' for itself,
>   * by observing whether any gfn->mfn translations are modified.
> @@ -660,6 +778,17 @@ static void ept_change_entry_type_global
>      ept_sync_domain(p2m);
>  }
> 
> +static void ept_memory_type_changed(struct p2m_domain *p2m)
> +{
> +    unsigned long mfn = ept_get_asr(&p2m->ept);
> +
> +    if ( !mfn )
> +        return;
> +
> +    if ( ept_invalidate_emt(_mfn(mfn)) )
> +        ept_sync_domain(p2m);
> +}
> +
>  static void __ept_sync_domain(void *info)
>  {
>      struct ept_data *ept = &((struct p2m_domain *)info)->ept;
> @@ -697,6 +826,7 @@ int ept_p2m_init(struct p2m_domain *p2m)
>      p2m->set_entry = ept_set_entry;
>      p2m->get_entry = ept_get_entry;
>      p2m->change_entry_type_global = ept_change_entry_type_global;
> +    p2m->memory_type_changed = ept_memory_type_changed;
>      p2m->audit_p2m = NULL;
> 
>      /* Set the memory type used when accessing EPT paging structures. */
> @@ -737,6 +867,7 @@ static void ept_dump_p2m_table(unsigned
>          [MTRR_TYPE_WRTHROUGH]      = "WT",
>          [MTRR_TYPE_WRPROT]         = "WP",
>          [MTRR_TYPE_WRBACK]         = "WB",
> +        [MTRR_NUM_TYPES]           = "??"
>      };
> 
>      for_each_domain(d)
> @@ -750,11 +881,16 @@ static void ept_dump_p2m_table(unsigned
> 
>          for ( gfn = 0; gfn <= p2m->max_mapped_pfn; gfn += 1UL << order )
>          {
> +            char c = 0;
> +
>              gfn_remainder = gfn;
>              table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m)));
> 
>              for ( i = ept_get_wl(ept); i > 0; i-- )
>              {
> +                ept_entry = table + (gfn_remainder >> (i * EPT_TABLE_ORDER));
> +                if ( ept_entry->emt == MTRR_NUM_TYPES )
> +                    c = '?';
>                  ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
>                  if ( ret != GUEST_TABLE_NORMAL_PAGE )
>                      break;
> @@ -775,7 +911,7 @@ static void ept_dump_p2m_table(unsigned
>                             memory_types[ept_entry->emt][0],
>                             memory_types[ept_entry->emt][1]
>                             ?: ept_entry->emt + '0',
> -                           ept_entry->ipat ? '!' : ' ');
> +                           c ?: ept_entry->ipat ? '!' : ' ');
> 
>                  if ( !(record_counter++ % 100) )
>                      process_pending_softirqs();
> --- a/xen/common/domctl.c
> +++ b/xen/common/domctl.c
> @@ -815,6 +815,10 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
>              ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1);
>          else
>              ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1);
> +#ifdef CONFIG_X86
> +        if ( !ret )
> +            memory_type_changed(d);
> +#endif
>      }
>      break;
> 
> --- a/xen/include/asm-x86/hvm/vmx/vmcs.h
> +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
> @@ -124,6 +124,9 @@ struct arch_vmx_struct {
> 
>      unsigned long        host_cr0;
> 
> +    /* Do we need to tolerate a spurious EPT_MISCONFIG VM exit? */
> +    bool_t               ept_spurious_misconfig;
> +
>      /* Is the guest in real mode? */
>      uint8_t              vmx_realmode;
>      /* Are we emulating rather than VMENTERing? */
> --- a/xen/include/asm-x86/hvm/vmx/vmx.h
> +++ b/xen/include/asm-x86/hvm/vmx/vmx.h
> @@ -520,6 +520,7 @@ int ept_p2m_init(struct p2m_domain *p2m)
>  void ept_p2m_uninit(struct p2m_domain *p2m);
> 
>  void ept_walk_table(struct domain *d, unsigned long gfn);
> +bool_t ept_handle_misconfig(uint64_t gpa);
>  void setup_ept_dump(void);
> 
>  void update_guest_eip(void);
> --- a/xen/include/asm-x86/mtrr.h
> +++ b/xen/include/asm-x86/mtrr.h
> @@ -88,6 +88,7 @@ extern bool_t mtrr_fix_range_msr_set(str
>                                       uint32_t row, uint64_t msr_content);
>  extern bool_t mtrr_def_type_msr_set(struct domain *, struct mtrr_state *,
>                                      uint64_t msr_content);
> +extern void memory_type_changed(struct domain *);
>  extern bool_t pat_msr_set(uint64_t *pat, uint64_t msr);
> 
>  bool_t is_var_mtrr_overlapped(struct mtrr_state *m);
> --- a/xen/include/asm-x86/p2m.h
> +++ b/xen/include/asm-x86/p2m.h
> @@ -233,6 +233,7 @@ struct p2m_domain {
>      void               (*change_entry_type_global)(struct p2m_domain
> *p2m,
>                                                     p2m_type_t
> ot,
>                                                     p2m_type_t
> nt);
> +    void               (*memory_type_changed)(struct p2m_domain
> *p2m);
> 
>      void               (*write_p2m_entry)(struct p2m_domain *p2m,
>                                            unsigned long gfn, l1_pgentry_t *p,
> @@ -506,6 +507,9 @@ void p2m_change_type_range(struct domain
>  p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn,
>                             p2m_type_t ot, p2m_type_t nt);
> 
> +/* Report a change affecting memory types. */
> +void p2m_memory_type_changed(struct domain *d);
> +
>  /* Set mmio addresses in the p2m table (for pass-through) */
>  int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
>  int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn);
> 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.