[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] Ping: [PATCH v3 3/4] x86/HVM: implement memory read caching



>>> On 25.09.18 at 16:25, <JBeulich@xxxxxxxx> wrote:
> Emulation requiring device model assistance uses a form of instruction
> re-execution, assuming that the second (and any further) pass takes
> exactly the same path. This is a valid assumption as far use of CPU
> registers goes (as those can't change without any other instruction
> executing in between), but is wrong for memory accesses. In particular
> it has been observed that Windows might page out buffers underneath an
> instruction currently under emulation (hitting between two passes). If
> the first pass translated a linear address successfully, any subsequent
> pass needs to do so too, yielding the exact same translation.
> 
> Introduce a cache (used by just guest page table accesses for now) to
> make sure above described assumption holds. This is a very simplistic
> implementation for now: Only exact matches are satisfied (no overlaps or
> partial reads or anything).
> 
> As to the actual data page in this scenario, there are a couple of
> aspects to take into consideration:
> - We must be talking about an insn accessing two locations (two memory
>   ones, one of which is MMIO, or a memory and an I/O one).
> - If the non I/O / MMIO side is being read, the re-read (if it occurs at
>   all) is having its result discarded, by taking the shortcut through
>   the first switch()'s STATE_IORESP_READY case in hvmemul_do_io(). Note
>   how, among all the re-issue sanity checks there, we avoid comparing
>   the actual data.
> - If the non I/O / MMIO side is being written, it is the OSes
>   responsibility to avoid actually moving page contents to disk while
>   there might still be a write access in flight - this is no different
>   in behavior from bare hardware.
> - Read-modify-write accesses are, as always, complicated, and while we
>   deal with them better nowadays than we did in the past, we're still
>   not quite there to guarantee hardware like behavior in all cases
>   anyway. Nothing is getting worse by the changes made here, afaict.
> 
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> Acked-by: Tim Deegan <tim@xxxxxxx>
> Reviewed-by: Paul Durrant <paul.durrant@xxxxxxxxxx>

SVM and VMX maintainers?

Thanks, Jan

> ---
> v3: Add text about the actual data page to the description.
> v2: Re-base.
> 
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -27,6 +27,18 @@
>  #include <asm/hvm/svm/svm.h>
>  #include <asm/vm_event.h>
>  
> +struct hvmemul_cache
> +{
> +    unsigned int max_ents;
> +    unsigned int num_ents;
> +    struct {
> +        paddr_t gpa:PADDR_BITS;
> +        unsigned int size:(BITS_PER_LONG - PADDR_BITS) / 2;
> +        unsigned int level:(BITS_PER_LONG - PADDR_BITS) / 2;
> +        unsigned long data;
> +    } ents[];
> +};
> +
>  static void hvmtrace_io_assist(const ioreq_t *p)
>  {
>      unsigned int size, event;
> @@ -541,7 +553,7 @@ static int hvmemul_do_mmio_addr(paddr_t
>   */
>  static void *hvmemul_map_linear_addr(
>      unsigned long linear, unsigned int bytes, uint32_t pfec,
> -    struct hvm_emulate_ctxt *hvmemul_ctxt)
> +    struct hvm_emulate_ctxt *hvmemul_ctxt, struct hvmemul_cache *cache)
>  {
>      struct vcpu *curr = current;
>      void *err, *mapping;
> @@ -586,7 +598,7 @@ static void *hvmemul_map_linear_addr(
>          ASSERT(mfn_x(*mfn) == 0);
>  
>          res = hvm_translate_get_page(curr, addr, true, pfec,
> -                                     &pfinfo, &page, NULL, &p2mt);
> +                                     &pfinfo, &page, NULL, &p2mt, cache);
>  
>          switch ( res )
>          {
> @@ -702,6 +714,8 @@ static int hvmemul_linear_to_phys(
>      gfn_t gfn, ngfn;
>      unsigned long done, todo, i, offset = addr & ~PAGE_MASK;
>      int reverse;
> +    struct hvmemul_cache *cache = pfec & PFEC_insn_fetch
> +                                  ? NULL : curr->arch.hvm.data_cache;
>  
>      /*
>       * Clip repetitions to a sensible maximum. This avoids extensive 
> looping in
> @@ -731,7 +745,7 @@ static int hvmemul_linear_to_phys(
>              return rc;
>          gfn = gaddr_to_gfn(gaddr);
>      }
> -    else if ( gfn_eq(gfn = paging_gla_to_gfn(curr, addr, &pfec, NULL),
> +    else if ( gfn_eq(gfn = paging_gla_to_gfn(curr, addr, &pfec, cache),
>                       INVALID_GFN) )
>      {
>          if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
> @@ -747,7 +761,7 @@ static int hvmemul_linear_to_phys(
>      {
>          /* Get the next PFN in the range. */
>          addr += reverse ? -PAGE_SIZE : PAGE_SIZE;
> -        ngfn = paging_gla_to_gfn(curr, addr, &pfec, NULL);
> +        ngfn = paging_gla_to_gfn(curr, addr, &pfec, cache);
>  
>          /* Is it contiguous with the preceding PFNs? If not then we're 
> done. */
>          if ( gfn_eq(ngfn, INVALID_GFN) ||
> @@ -1073,7 +1087,10 @@ static int linear_read(unsigned long add
>                         uint32_t pfec, struct hvm_emulate_ctxt 
> *hvmemul_ctxt)
>  {
>      pagefault_info_t pfinfo;
> -    int rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo);
> +    int rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo,
> +                                        (pfec & PFEC_insn_fetch
> +                                         ? NULL
> +                                         : current->arch.hvm.data_cache));
>  
>      switch ( rc )
>      {
> @@ -1270,7 +1287,8 @@ static int hvmemul_write(
>  
>      if ( !known_gla(addr, bytes, pfec) )
>      {
> -        mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> +        mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
> +                                          current->arch.hvm.data_cache);
>          if ( IS_ERR(mapping) )
>               return ~PTR_ERR(mapping);
>      }
> @@ -1312,7 +1330,8 @@ static int hvmemul_rmw(
>  
>      if ( !known_gla(addr, bytes, pfec) )
>      {
> -        mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> +        mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
> +                                          current->arch.hvm.data_cache);
>          if ( IS_ERR(mapping) )
>              return ~PTR_ERR(mapping);
>      }
> @@ -1466,7 +1485,8 @@ static int hvmemul_cmpxchg(
>      else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
>          pfec |= PFEC_user_mode;
>  
> -    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> +    mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
> +                                      curr->arch.hvm.data_cache);
>      if ( IS_ERR(mapping) )
>          return ~PTR_ERR(mapping);
>  
> @@ -2373,6 +2393,7 @@ static int _hvm_emulate_one(struct hvm_e
>      {
>          vio->mmio_cache_count = 0;
>          vio->mmio_insn_bytes = 0;
> +        curr->arch.hvm.data_cache->num_ents = 0;
>      }
>      else
>      {
> @@ -2591,7 +2612,7 @@ void hvm_emulate_init_per_insn(
>                                          &addr) &&
>               hvm_copy_from_guest_linear(hvmemul_ctxt->insn_buf, addr,
>                                          sizeof(hvmemul_ctxt->insn_buf),
> -                                        pfec | PFEC_insn_fetch,
> +                                        pfec | PFEC_insn_fetch, NULL,
>                                          NULL) == HVMTRANS_okay) ?
>              sizeof(hvmemul_ctxt->insn_buf) : 0;
>      }
> @@ -2664,9 +2685,35 @@ void hvm_dump_emulation_state(const char
>             hvmemul_ctxt->insn_buf);
>  }
>  
> +struct hvmemul_cache *hvmemul_cache_init(unsigned int nents)
> +{
> +    struct hvmemul_cache *cache = xmalloc_bytes(offsetof(struct 
> hvmemul_cache,
> +                                                         ents[nents]));
> +
> +    if ( cache )
> +    {
> +        cache->num_ents = 0;
> +        cache->max_ents = nents;
> +    }
> +
> +    return cache;
> +}
> +
>  bool hvmemul_read_cache(const struct hvmemul_cache *cache, paddr_t gpa,
>                          unsigned int level, void *buffer, unsigned int 
> size)
>  {
> +    unsigned int i;
> +
> +    ASSERT(size <= sizeof(cache->ents->data));
> +
> +    for ( i = 0; i < cache->num_ents; ++i )
> +        if ( cache->ents[i].level == level && cache->ents[i].gpa == gpa &&
> +             cache->ents[i].size == size )
> +        {
> +            memcpy(buffer, &cache->ents[i].data, size);
> +            return true;
> +        }
> +
>      return false;
>  }
>  
> @@ -2674,6 +2721,35 @@ void hvmemul_write_cache(struct hvmemul_
>                           unsigned int level, const void *buffer,
>                           unsigned int size)
>  {
> +    unsigned int i;
> +
> +    if ( size > sizeof(cache->ents->data) )
> +    {
> +        ASSERT_UNREACHABLE();
> +        return;
> +    }
> +
> +    for ( i = 0; i < cache->num_ents; ++i )
> +        if ( cache->ents[i].level == level && cache->ents[i].gpa == gpa &&
> +             cache->ents[i].size == size )
> +        {
> +            memcpy(&cache->ents[i].data, buffer, size);
> +            return;
> +        }
> +
> +    if ( unlikely(i >= cache->max_ents) )
> +    {
> +        ASSERT_UNREACHABLE();
> +        return;
> +    }
> +
> +    cache->ents[i].level = level;
> +    cache->ents[i].gpa   = gpa;
> +    cache->ents[i].size  = size;
> +
> +    memcpy(&cache->ents[i].data, buffer, size);
> +
> +    cache->num_ents = i + 1;
>  }
>  
>  /*
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -1498,6 +1498,17 @@ int hvm_vcpu_initialise(struct vcpu *v)
>  
>      v->arch.hvm.inject_event.vector = HVM_EVENT_VECTOR_UNSET;
>  
> +    /*
> +     * Leaving aside the insn fetch, for which we don't use this cache, no
> +     * insn can access more than 8 independent linear addresses (AVX2
> +     * gathers being the worst). Each such linear range can span a page
> +     * boundary, i.e. require two page walks.
> +     */
> +    v->arch.hvm.data_cache = hvmemul_cache_init(CONFIG_PAGING_LEVELS * 8 * 
> 2);
> +    rc = -ENOMEM;
> +    if ( !v->arch.hvm.data_cache )
> +        goto fail4;
> +
>      rc = setup_compat_arg_xlat(v); /* teardown: free_compat_arg_xlat() */
>      if ( rc != 0 )
>          goto fail4;
> @@ -1527,6 +1538,7 @@ int hvm_vcpu_initialise(struct vcpu *v)
>   fail5:
>      free_compat_arg_xlat(v);
>   fail4:
> +    hvmemul_cache_destroy(v->arch.hvm.data_cache);
>      hvm_funcs.vcpu_destroy(v);
>   fail3:
>      vlapic_destroy(v);
> @@ -1549,6 +1561,8 @@ void hvm_vcpu_destroy(struct vcpu *v)
>  
>      free_compat_arg_xlat(v);
>  
> +    hvmemul_cache_destroy(v->arch.hvm.data_cache);
> +
>      tasklet_kill(&v->arch.hvm.assert_evtchn_irq_tasklet);
>      hvm_funcs.vcpu_destroy(v);
>  
> @@ -2923,7 +2937,7 @@ void hvm_task_switch(
>      }
>  
>      rc = hvm_copy_from_guest_linear(
> -        &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo);
> +        &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo, NULL);
>      if ( rc == HVMTRANS_bad_linear_to_gfn )
>          hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
>      if ( rc != HVMTRANS_okay )
> @@ -2970,7 +2984,7 @@ void hvm_task_switch(
>          goto out;
>  
>      rc = hvm_copy_from_guest_linear(
> -        &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo);
> +        &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo, NULL);
>      if ( rc == HVMTRANS_bad_linear_to_gfn )
>          hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
>      /*
> @@ -3081,7 +3095,7 @@ void hvm_task_switch(
>  enum hvm_translation_result hvm_translate_get_page(
>      struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec,
>      pagefault_info_t *pfinfo, struct page_info **page_p,
> -    gfn_t *gfn_p, p2m_type_t *p2mt_p)
> +    gfn_t *gfn_p, p2m_type_t *p2mt_p, struct hvmemul_cache *cache)
>  {
>      struct page_info *page;
>      p2m_type_t p2mt;
> @@ -3089,7 +3103,7 @@ enum hvm_translation_result hvm_translat
>  
>      if ( linear )
>      {
> -        gfn = paging_gla_to_gfn(v, addr, &pfec, NULL);
> +        gfn = paging_gla_to_gfn(v, addr, &pfec, cache);
>  
>          if ( gfn_eq(gfn, INVALID_GFN) )
>          {
> @@ -3161,7 +3175,7 @@ enum hvm_translation_result hvm_translat
>  #define HVMCOPY_linear     (1u<<2)
>  static enum hvm_translation_result __hvm_copy(
>      void *buf, paddr_t addr, int size, struct vcpu *v, unsigned int flags,
> -    uint32_t pfec, pagefault_info_t *pfinfo)
> +    uint32_t pfec, pagefault_info_t *pfinfo, struct hvmemul_cache *cache)
>  {
>      gfn_t gfn;
>      struct page_info *page;
> @@ -3194,8 +3208,8 @@ static enum hvm_translation_result __hvm
>  
>          count = min_t(int, PAGE_SIZE - gpa, todo);
>  
> -        res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear,
> -                                     pfec, pfinfo, &page, &gfn, &p2mt);
> +        res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear, pfec,
> +                                     pfinfo, &page, &gfn, &p2mt, cache);
>          if ( res != HVMTRANS_okay )
>              return res;
>  
> @@ -3242,14 +3256,14 @@ enum hvm_translation_result hvm_copy_to_
>      paddr_t paddr, void *buf, int size, struct vcpu *v)
>  {
>      return __hvm_copy(buf, paddr, size, v,
> -                      HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL);
> +                      HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL, NULL);
>  }
>  
>  enum hvm_translation_result hvm_copy_from_guest_phys(
>      void *buf, paddr_t paddr, int size)
>  {
>      return __hvm_copy(buf, paddr, size, current,
> -                      HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL);
> +                      HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL, NULL);
>  }
>  
>  enum hvm_translation_result hvm_copy_to_guest_linear(
> @@ -3258,16 +3272,17 @@ enum hvm_translation_result hvm_copy_to_
>  {
>      return __hvm_copy(buf, addr, size, current,
>                        HVMCOPY_to_guest | HVMCOPY_linear,
> -                      PFEC_page_present | PFEC_write_access | pfec, 
> pfinfo);
> +                      PFEC_page_present | PFEC_write_access | pfec,
> +                      pfinfo, NULL);
>  }
>  
>  enum hvm_translation_result hvm_copy_from_guest_linear(
>      void *buf, unsigned long addr, int size, uint32_t pfec,
> -    pagefault_info_t *pfinfo)
> +    pagefault_info_t *pfinfo, struct hvmemul_cache *cache)
>  {
>      return __hvm_copy(buf, addr, size, current,
>                        HVMCOPY_from_guest | HVMCOPY_linear,
> -                      PFEC_page_present | pfec, pfinfo);
> +                      PFEC_page_present | pfec, pfinfo, cache);
>  }
>  
>  unsigned long copy_to_user_hvm(void *to, const void *from, unsigned int 
> len)
> @@ -3308,7 +3323,8 @@ unsigned long copy_from_user_hvm(void *t
>          return 0;
>      }
>  
> -    rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len, 0, NULL);
> +    rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len,
> +                                    0, NULL, NULL);
>      return rc ? len : 0; /* fake a copy_from_user() return code */
>  }
>  
> @@ -3724,7 +3740,7 @@ void hvm_ud_intercept(struct cpu_user_re
>                                          sizeof(sig), hvm_access_insn_fetch,
>                                          cs, &addr) &&
>               (hvm_copy_from_guest_linear(sig, addr, sizeof(sig),
> -                                         walk, NULL) == HVMTRANS_okay) &&
> +                                         walk, NULL, NULL) == 
> HVMTRANS_okay) &&
>               (memcmp(sig, "\xf\xbxen", sizeof(sig)) == 0) )
>          {
>              regs->rip += sizeof(sig);
> --- a/xen/arch/x86/hvm/svm/svm.c
> +++ b/xen/arch/x86/hvm/svm/svm.c
> @@ -1358,7 +1358,7 @@ static void svm_emul_swint_injection(str
>          goto raise_exception;
>  
>      rc = hvm_copy_from_guest_linear(&idte, idte_linear_addr, idte_size,
> -                                    PFEC_implicit, &pfinfo);
> +                                    PFEC_implicit, &pfinfo, NULL);
>      if ( rc )
>      {
>          if ( rc == HVMTRANS_bad_linear_to_gfn )
> --- a/xen/arch/x86/hvm/vmx/vvmx.c
> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
> @@ -475,7 +475,7 @@ static int decode_vmx_inst(struct cpu_us
>          {
>              pagefault_info_t pfinfo;
>              int rc = hvm_copy_from_guest_linear(poperandS, base, size,
> -                                                0, &pfinfo);
> +                                                0, &pfinfo, NULL);
>  
>              if ( rc == HVMTRANS_bad_linear_to_gfn )
>                  hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
> --- a/xen/arch/x86/mm/shadow/common.c
> +++ b/xen/arch/x86/mm/shadow/common.c
> @@ -166,7 +166,7 @@ const struct x86_emulate_ops *shadow_ini
>              hvm_access_insn_fetch, sh_ctxt, &addr) &&
>           !hvm_copy_from_guest_linear(
>               sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf),
> -             PFEC_insn_fetch, NULL))
> +             PFEC_insn_fetch, NULL, NULL))
>          ? sizeof(sh_ctxt->insn_buf) : 0;
>  
>      return &hvm_shadow_emulator_ops;
> @@ -201,7 +201,7 @@ void shadow_continue_emulation(struct sh
>                  hvm_access_insn_fetch, sh_ctxt, &addr) &&
>               !hvm_copy_from_guest_linear(
>                   sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf),
> -                 PFEC_insn_fetch, NULL))
> +                 PFEC_insn_fetch, NULL, NULL))
>              ? sizeof(sh_ctxt->insn_buf) : 0;
>          sh_ctxt->insn_buf_eip = regs->rip;
>      }
> --- a/xen/arch/x86/mm/shadow/hvm.c
> +++ b/xen/arch/x86/mm/shadow/hvm.c
> @@ -125,7 +125,7 @@ hvm_read(enum x86_segment seg,
>      rc = hvm_copy_from_guest_linear(p_data, addr, bytes,
>                                      (access_type == hvm_access_insn_fetch
>                                       ? PFEC_insn_fetch : 0),
> -                                    &pfinfo);
> +                                    &pfinfo, NULL);
>  
>      switch ( rc )
>      {
> --- a/xen/include/asm-x86/hvm/emulate.h
> +++ b/xen/include/asm-x86/hvm/emulate.h
> @@ -99,6 +99,11 @@ int hvmemul_do_pio_buffer(uint16_t port,
>                            void *buffer);
>  
>  struct hvmemul_cache;
> +struct hvmemul_cache *hvmemul_cache_init(unsigned int nents);
> +static inline void hvmemul_cache_destroy(struct hvmemul_cache *cache)
> +{
> +    xfree(cache);
> +}
>  bool hvmemul_read_cache(const struct hvmemul_cache *, paddr_t gpa,
>                          unsigned int level, void *buffer, unsigned int 
> size);
>  void hvmemul_write_cache(struct hvmemul_cache *, paddr_t gpa,
> --- a/xen/include/asm-x86/hvm/support.h
> +++ b/xen/include/asm-x86/hvm/support.h
> @@ -99,7 +99,7 @@ enum hvm_translation_result hvm_copy_to_
>      pagefault_info_t *pfinfo);
>  enum hvm_translation_result hvm_copy_from_guest_linear(
>      void *buf, unsigned long addr, int size, uint32_t pfec,
> -    pagefault_info_t *pfinfo);
> +    pagefault_info_t *pfinfo, struct hvmemul_cache *cache);
>  
>  /*
>   * Get a reference on the page under an HVM physical or linear address.  If
> @@ -110,7 +110,7 @@ enum hvm_translation_result hvm_copy_fro
>  enum hvm_translation_result hvm_translate_get_page(
>      struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec,
>      pagefault_info_t *pfinfo, struct page_info **page_p,
> -    gfn_t *gfn_p, p2m_type_t *p2mt_p);
> +    gfn_t *gfn_p, p2m_type_t *p2mt_p, struct hvmemul_cache *cache);
>  
>  #define HVM_HCALL_completed  0 /* hypercall completed - no further action 
> */
>  #define HVM_HCALL_preempted  1 /* hypercall preempted - re-execute VMCALL 
> */
> --- a/xen/include/asm-x86/hvm/vcpu.h
> +++ b/xen/include/asm-x86/hvm/vcpu.h
> @@ -53,8 +53,6 @@ struct hvm_mmio_cache {
>      uint8_t buffer[32];
>  };
>  
> -struct hvmemul_cache;
> -
>  struct hvm_vcpu_io {
>      /* I/O request in flight to device model. */
>      enum hvm_io_completion io_completion;
> @@ -200,6 +198,7 @@ struct hvm_vcpu {
>      u8                  cache_mode;
>  
>      struct hvm_vcpu_io  hvm_io;
> +    struct hvmemul_cache *data_cache;
>  
>      /* Pending hw/sw interrupt (.vector = -1 means nothing pending). */
>      struct x86_event     inject_event;
> 
> 
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxxxxxxxxx 
> https://lists.xenproject.org/mailman/listinfo/xen-devel 




_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.