[Xen-devel] Ping: [PATCH v3 3/4] x86/HVM: implement memory read caching
>>> On 25.09.18 at 16:25, <JBeulich@xxxxxxxx> wrote:
> Emulation requiring device model assistance uses a form of instruction
> re-execution, assuming that the second (and any further) pass takes
> exactly the same path. This is a valid assumption as far as use of CPU
> registers goes (as those can't change without any other instruction
> executing in between), but is wrong for memory accesses. In particular
> it has been observed that Windows might page out buffers underneath an
> instruction currently under emulation (hitting between two passes). If
> the first pass translated a linear address successfully, any subsequent
> pass needs to do so too, yielding the exact same translation.
>
> Introduce a cache (used by just guest page table accesses for now) to
> make sure the above-described assumption holds. This is a very simplistic
> implementation for now: Only exact matches are satisfied (no overlaps or
> partial reads or anything).
>
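To make the exact-match behaviour concrete, here is a small self-contained
sketch of the same idea (all names and constants below are illustrative only,
not the patch's code: the real struct hvmemul_cache packs gpa/size/level into
bit-fields and lives inside Xen). A lookup hits only on an identical
(gpa, level, size) tuple, so a re-executed page-walk read sees the first
pass's value even if guest memory changed in between:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Simplified stand-in for struct hvmemul_cache (no bit-field packing). */
struct demo_cache {
    unsigned int max_ents, num_ents;
    struct {
        uint64_t gpa;
        unsigned int size, level;
        unsigned long data;
    } ents[64];
};

/* Hit only on an exact (gpa, level, size) match; no overlaps or partial reads. */
static bool demo_read_cache(const struct demo_cache *c, uint64_t gpa,
                            unsigned int level, void *buffer, unsigned int size)
{
    unsigned int i;

    for ( i = 0; i < c->num_ents; ++i )
        if ( c->ents[i].level == level && c->ents[i].gpa == gpa &&
             c->ents[i].size == size )
        {
            memcpy(buffer, &c->ents[i].data, size);
            return true;
        }

    return false;
}

/* Update an existing exact match, or append a new entry while room remains. */
static void demo_write_cache(struct demo_cache *c, uint64_t gpa,
                             unsigned int level, const void *buffer,
                             unsigned int size)
{
    unsigned int i;

    for ( i = 0; i < c->num_ents; ++i )
        if ( c->ents[i].level == level && c->ents[i].gpa == gpa &&
             c->ents[i].size == size )
        {
            memcpy(&c->ents[i].data, buffer, size);
            return;
        }

    if ( i >= c->max_ents )
        return;                       /* the real code treats this as a bug */

    c->ents[i].level = level;
    c->ents[i].gpa = gpa;
    c->ents[i].size = size;
    memcpy(&c->ents[i].data, buffer, size);
    c->num_ents = i + 1;
}

/* A pretend PTE read: consult the cache first, fill it on a miss. */
static unsigned long demo_pte_read(struct demo_cache *c, uint64_t gpa,
                                   unsigned int level, const unsigned long *mem)
{
    unsigned long pte;

    if ( !demo_read_cache(c, gpa, level, &pte, sizeof(pte)) )
    {
        pte = *mem;                                 /* "real" memory access */
        demo_write_cache(c, gpa, level, &pte, sizeof(pte));
    }

    return pte;
}

int main(void)
{
    struct demo_cache c = { .max_ents = 64 };
    unsigned long guest_pte = 0x1027;    /* pretend PTE contents, first pass */
    unsigned long first, second;

    first = demo_pte_read(&c, 0xabc000, 1, &guest_pte);
    guest_pte = 0;                 /* guest pages the mapping out meanwhile */
    second = demo_pte_read(&c, 0xabc000, 1, &guest_pte);    /* re-execution */

    return first == second ? 0 : 1;   /* 0: both passes saw the same PTE */
}

Built as an ordinary user-space program this exits with 0: the second "walk"
is satisfied from the cache, which is what the hunks below arrange for
paging_gla_to_gfn() and hvm_copy_from_guest_linear().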
> As to the actual data page in this scenario, there are a couple of
> aspects to take into consideration:
> - We must be talking about an insn accessing two locations (two memory
> ones, one of which is MMIO, or a memory and an I/O one).
> - If the non I/O / MMIO side is being read, the re-read (if it occurs at
> all) has its result discarded by taking the shortcut through
> the first switch()'s STATE_IORESP_READY case in hvmemul_do_io(). Note
> how, among all the re-issue sanity checks there, we avoid comparing
> the actual data.
> - If the non I/O / MMIO side is being written, it is the OS's
> responsibility to avoid actually moving page contents to disk while
> there might still be a write access in flight - this is no different
> in behavior from bare hardware.
> - Read-modify-write accesses are, as always, complicated, and while we
> deal with them better nowadays than we did in the past, we're still
> not quite there to guarantee hardware-like behavior in all cases
> anyway. Nothing is getting worse by the changes made here, afaict.
>
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> Acked-by: Tim Deegan <tim@xxxxxxx>
> Reviewed-by: Paul Durrant <paul.durrant@xxxxxxxxxx>
SVM and VMX maintainers?
Thanks, Jan
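(A quick check on the cache sizing chosen in hvm_vcpu_initialise() below:
with 4-level paging CONFIG_PAGING_LEVELS is 4, so each vCPU's cache holds
4 * 8 * 2 = 64 entries, i.e. up to 8 independent linear addresses per insn,
each possibly crossing a page boundary and hence needing two walks, with one
cached read per paging level.)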
> ---
> v3: Add text about the actual data page to the description.
> v2: Re-base.
>
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -27,6 +27,18 @@
> #include <asm/hvm/svm/svm.h>
> #include <asm/vm_event.h>
>
> +struct hvmemul_cache
> +{
> + unsigned int max_ents;
> + unsigned int num_ents;
> + struct {
> + paddr_t gpa:PADDR_BITS;
> + unsigned int size:(BITS_PER_LONG - PADDR_BITS) / 2;
> + unsigned int level:(BITS_PER_LONG - PADDR_BITS) / 2;
> + unsigned long data;
> + } ents[];
> +};
> +
> static void hvmtrace_io_assist(const ioreq_t *p)
> {
> unsigned int size, event;
> @@ -541,7 +553,7 @@ static int hvmemul_do_mmio_addr(paddr_t
> */
> static void *hvmemul_map_linear_addr(
> unsigned long linear, unsigned int bytes, uint32_t pfec,
> - struct hvm_emulate_ctxt *hvmemul_ctxt)
> + struct hvm_emulate_ctxt *hvmemul_ctxt, struct hvmemul_cache *cache)
> {
> struct vcpu *curr = current;
> void *err, *mapping;
> @@ -586,7 +598,7 @@ static void *hvmemul_map_linear_addr(
> ASSERT(mfn_x(*mfn) == 0);
>
> res = hvm_translate_get_page(curr, addr, true, pfec,
> - &pfinfo, &page, NULL, &p2mt);
> + &pfinfo, &page, NULL, &p2mt, cache);
>
> switch ( res )
> {
> @@ -702,6 +714,8 @@ static int hvmemul_linear_to_phys(
> gfn_t gfn, ngfn;
> unsigned long done, todo, i, offset = addr & ~PAGE_MASK;
> int reverse;
> + struct hvmemul_cache *cache = pfec & PFEC_insn_fetch
> + ? NULL : curr->arch.hvm.data_cache;
>
> /*
> * Clip repetitions to a sensible maximum. This avoids extensive looping in
> @@ -731,7 +745,7 @@ static int hvmemul_linear_to_phys(
> return rc;
> gfn = gaddr_to_gfn(gaddr);
> }
> - else if ( gfn_eq(gfn = paging_gla_to_gfn(curr, addr, &pfec, NULL),
> + else if ( gfn_eq(gfn = paging_gla_to_gfn(curr, addr, &pfec, cache),
> INVALID_GFN) )
> {
> if ( pfec & (PFEC_page_paged | PFEC_page_shared) )
> @@ -747,7 +761,7 @@ static int hvmemul_linear_to_phys(
> {
> /* Get the next PFN in the range. */
> addr += reverse ? -PAGE_SIZE : PAGE_SIZE;
> - ngfn = paging_gla_to_gfn(curr, addr, &pfec, NULL);
> + ngfn = paging_gla_to_gfn(curr, addr, &pfec, cache);
>
> /* Is it contiguous with the preceding PFNs? If not then we're done. */
> if ( gfn_eq(ngfn, INVALID_GFN) ||
> @@ -1073,7 +1087,10 @@ static int linear_read(unsigned long add
> uint32_t pfec, struct hvm_emulate_ctxt *hvmemul_ctxt)
> {
> pagefault_info_t pfinfo;
> - int rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo);
> + int rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo,
> + (pfec & PFEC_insn_fetch
> + ? NULL
> + : current->arch.hvm.data_cache));
>
> switch ( rc )
> {
> @@ -1270,7 +1287,8 @@ static int hvmemul_write(
>
> if ( !known_gla(addr, bytes, pfec) )
> {
> - mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> + mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
> + current->arch.hvm.data_cache);
> if ( IS_ERR(mapping) )
> return ~PTR_ERR(mapping);
> }
> @@ -1312,7 +1330,8 @@ static int hvmemul_rmw(
>
> if ( !known_gla(addr, bytes, pfec) )
> {
> - mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> + mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
> + current->arch.hvm.data_cache);
> if ( IS_ERR(mapping) )
> return ~PTR_ERR(mapping);
> }
> @@ -1466,7 +1485,8 @@ static int hvmemul_cmpxchg(
> else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
> pfec |= PFEC_user_mode;
>
> - mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt);
> + mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt,
> + curr->arch.hvm.data_cache);
> if ( IS_ERR(mapping) )
> return ~PTR_ERR(mapping);
>
> @@ -2373,6 +2393,7 @@ static int _hvm_emulate_one(struct hvm_e
> {
> vio->mmio_cache_count = 0;
> vio->mmio_insn_bytes = 0;
> + curr->arch.hvm.data_cache->num_ents = 0;
> }
> else
> {
> @@ -2591,7 +2612,7 @@ void hvm_emulate_init_per_insn(
> &addr) &&
> hvm_copy_from_guest_linear(hvmemul_ctxt->insn_buf, addr,
> sizeof(hvmemul_ctxt->insn_buf),
> - pfec | PFEC_insn_fetch,
> + pfec | PFEC_insn_fetch, NULL,
> NULL) == HVMTRANS_okay) ?
> sizeof(hvmemul_ctxt->insn_buf) : 0;
> }
> @@ -2664,9 +2685,35 @@ void hvm_dump_emulation_state(const char
> hvmemul_ctxt->insn_buf);
> }
>
> +struct hvmemul_cache *hvmemul_cache_init(unsigned int nents)
> +{
> + struct hvmemul_cache *cache = xmalloc_bytes(offsetof(struct hvmemul_cache,
> + ents[nents]));
> +
> + if ( cache )
> + {
> + cache->num_ents = 0;
> + cache->max_ents = nents;
> + }
> +
> + return cache;
> +}
> +
> bool hvmemul_read_cache(const struct hvmemul_cache *cache, paddr_t gpa,
> unsigned int level, void *buffer, unsigned int size)
> {
> + unsigned int i;
> +
> + ASSERT(size <= sizeof(cache->ents->data));
> +
> + for ( i = 0; i < cache->num_ents; ++i )
> + if ( cache->ents[i].level == level && cache->ents[i].gpa == gpa &&
> + cache->ents[i].size == size )
> + {
> + memcpy(buffer, &cache->ents[i].data, size);
> + return true;
> + }
> +
> return false;
> }
>
> @@ -2674,6 +2721,35 @@ void hvmemul_write_cache(struct hvmemul_
> unsigned int level, const void *buffer,
> unsigned int size)
> {
> + unsigned int i;
> +
> + if ( size > sizeof(cache->ents->data) )
> + {
> + ASSERT_UNREACHABLE();
> + return;
> + }
> +
> + for ( i = 0; i < cache->num_ents; ++i )
> + if ( cache->ents[i].level == level && cache->ents[i].gpa == gpa &&
> + cache->ents[i].size == size )
> + {
> + memcpy(&cache->ents[i].data, buffer, size);
> + return;
> + }
> +
> + if ( unlikely(i >= cache->max_ents) )
> + {
> + ASSERT_UNREACHABLE();
> + return;
> + }
> +
> + cache->ents[i].level = level;
> + cache->ents[i].gpa = gpa;
> + cache->ents[i].size = size;
> +
> + memcpy(&cache->ents[i].data, buffer, size);
> +
> + cache->num_ents = i + 1;
> }
>
> /*
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -1498,6 +1498,17 @@ int hvm_vcpu_initialise(struct vcpu *v)
>
> v->arch.hvm.inject_event.vector = HVM_EVENT_VECTOR_UNSET;
>
> + /*
> + * Leaving aside the insn fetch, for which we don't use this cache, no
> + * insn can access more than 8 independent linear addresses (AVX2
> + * gathers being the worst). Each such linear range can span a page
> + * boundary, i.e. require two page walks.
> + */
> + v->arch.hvm.data_cache = hvmemul_cache_init(CONFIG_PAGING_LEVELS * 8 * 2);
> + rc = -ENOMEM;
> + if ( !v->arch.hvm.data_cache )
> + goto fail4;
> +
> rc = setup_compat_arg_xlat(v); /* teardown: free_compat_arg_xlat() */
> if ( rc != 0 )
> goto fail4;
> @@ -1527,6 +1538,7 @@ int hvm_vcpu_initialise(struct vcpu *v)
> fail5:
> free_compat_arg_xlat(v);
> fail4:
> + hvmemul_cache_destroy(v->arch.hvm.data_cache);
> hvm_funcs.vcpu_destroy(v);
> fail3:
> vlapic_destroy(v);
> @@ -1549,6 +1561,8 @@ void hvm_vcpu_destroy(struct vcpu *v)
>
> free_compat_arg_xlat(v);
>
> + hvmemul_cache_destroy(v->arch.hvm.data_cache);
> +
> tasklet_kill(&v->arch.hvm.assert_evtchn_irq_tasklet);
> hvm_funcs.vcpu_destroy(v);
>
> @@ -2923,7 +2937,7 @@ void hvm_task_switch(
> }
>
> rc = hvm_copy_from_guest_linear(
> - &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo);
> + &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo, NULL);
> if ( rc == HVMTRANS_bad_linear_to_gfn )
> hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
> if ( rc != HVMTRANS_okay )
> @@ -2970,7 +2984,7 @@ void hvm_task_switch(
> goto out;
>
> rc = hvm_copy_from_guest_linear(
> - &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo);
> + &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo, NULL);
> if ( rc == HVMTRANS_bad_linear_to_gfn )
> hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
> /*
> @@ -3081,7 +3095,7 @@ void hvm_task_switch(
> enum hvm_translation_result hvm_translate_get_page(
> struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec,
> pagefault_info_t *pfinfo, struct page_info **page_p,
> - gfn_t *gfn_p, p2m_type_t *p2mt_p)
> + gfn_t *gfn_p, p2m_type_t *p2mt_p, struct hvmemul_cache *cache)
> {
> struct page_info *page;
> p2m_type_t p2mt;
> @@ -3089,7 +3103,7 @@ enum hvm_translation_result hvm_translat
>
> if ( linear )
> {
> - gfn = paging_gla_to_gfn(v, addr, &pfec, NULL);
> + gfn = paging_gla_to_gfn(v, addr, &pfec, cache);
>
> if ( gfn_eq(gfn, INVALID_GFN) )
> {
> @@ -3161,7 +3175,7 @@ enum hvm_translation_result hvm_translat
> #define HVMCOPY_linear (1u<<2)
> static enum hvm_translation_result __hvm_copy(
> void *buf, paddr_t addr, int size, struct vcpu *v, unsigned int flags,
> - uint32_t pfec, pagefault_info_t *pfinfo)
> + uint32_t pfec, pagefault_info_t *pfinfo, struct hvmemul_cache *cache)
> {
> gfn_t gfn;
> struct page_info *page;
> @@ -3194,8 +3208,8 @@ static enum hvm_translation_result __hvm
>
> count = min_t(int, PAGE_SIZE - gpa, todo);
>
> - res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear,
> - pfec, pfinfo, &page, &gfn, &p2mt);
> + res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear, pfec,
> + pfinfo, &page, &gfn, &p2mt, cache);
> if ( res != HVMTRANS_okay )
> return res;
>
> @@ -3242,14 +3256,14 @@ enum hvm_translation_result hvm_copy_to_
> paddr_t paddr, void *buf, int size, struct vcpu *v)
> {
> return __hvm_copy(buf, paddr, size, v,
> - HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL);
> + HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL, NULL);
> }
>
> enum hvm_translation_result hvm_copy_from_guest_phys(
> void *buf, paddr_t paddr, int size)
> {
> return __hvm_copy(buf, paddr, size, current,
> - HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL);
> + HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL, NULL);
> }
>
> enum hvm_translation_result hvm_copy_to_guest_linear(
> @@ -3258,16 +3272,17 @@ enum hvm_translation_result hvm_copy_to_
> {
> return __hvm_copy(buf, addr, size, current,
> HVMCOPY_to_guest | HVMCOPY_linear,
> - PFEC_page_present | PFEC_write_access | pfec, pfinfo);
> + PFEC_page_present | PFEC_write_access | pfec,
> + pfinfo, NULL);
> }
>
> enum hvm_translation_result hvm_copy_from_guest_linear(
> void *buf, unsigned long addr, int size, uint32_t pfec,
> - pagefault_info_t *pfinfo)
> + pagefault_info_t *pfinfo, struct hvmemul_cache *cache)
> {
> return __hvm_copy(buf, addr, size, current,
> HVMCOPY_from_guest | HVMCOPY_linear,
> - PFEC_page_present | pfec, pfinfo);
> + PFEC_page_present | pfec, pfinfo, cache);
> }
>
> unsigned long copy_to_user_hvm(void *to, const void *from, unsigned int len)
> @@ -3308,7 +3323,8 @@ unsigned long copy_from_user_hvm(void *t
> return 0;
> }
>
> - rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len, 0, NULL);
> + rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len,
> + 0, NULL, NULL);
> return rc ? len : 0; /* fake a copy_from_user() return code */
> }
>
> @@ -3724,7 +3740,7 @@ void hvm_ud_intercept(struct cpu_user_re
> sizeof(sig), hvm_access_insn_fetch,
> cs, &addr) &&
> (hvm_copy_from_guest_linear(sig, addr, sizeof(sig),
> - walk, NULL) == HVMTRANS_okay) &&
> + walk, NULL, NULL) == HVMTRANS_okay) &&
> (memcmp(sig, "\xf\xbxen", sizeof(sig)) == 0) )
> {
> regs->rip += sizeof(sig);
> --- a/xen/arch/x86/hvm/svm/svm.c
> +++ b/xen/arch/x86/hvm/svm/svm.c
> @@ -1358,7 +1358,7 @@ static void svm_emul_swint_injection(str
> goto raise_exception;
>
> rc = hvm_copy_from_guest_linear(&idte, idte_linear_addr, idte_size,
> - PFEC_implicit, &pfinfo);
> + PFEC_implicit, &pfinfo, NULL);
> if ( rc )
> {
> if ( rc == HVMTRANS_bad_linear_to_gfn )
> --- a/xen/arch/x86/hvm/vmx/vvmx.c
> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
> @@ -475,7 +475,7 @@ static int decode_vmx_inst(struct cpu_us
> {
> pagefault_info_t pfinfo;
> int rc = hvm_copy_from_guest_linear(poperandS, base, size,
> - 0, &pfinfo);
> + 0, &pfinfo, NULL);
>
> if ( rc == HVMTRANS_bad_linear_to_gfn )
> hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
> --- a/xen/arch/x86/mm/shadow/common.c
> +++ b/xen/arch/x86/mm/shadow/common.c
> @@ -166,7 +166,7 @@ const struct x86_emulate_ops *shadow_ini
> hvm_access_insn_fetch, sh_ctxt, &addr) &&
> !hvm_copy_from_guest_linear(
> sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf),
> - PFEC_insn_fetch, NULL))
> + PFEC_insn_fetch, NULL, NULL))
> ? sizeof(sh_ctxt->insn_buf) : 0;
>
> return &hvm_shadow_emulator_ops;
> @@ -201,7 +201,7 @@ void shadow_continue_emulation(struct sh
> hvm_access_insn_fetch, sh_ctxt, &addr) &&
> !hvm_copy_from_guest_linear(
> sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf),
> - PFEC_insn_fetch, NULL))
> + PFEC_insn_fetch, NULL, NULL))
> ? sizeof(sh_ctxt->insn_buf) : 0;
> sh_ctxt->insn_buf_eip = regs->rip;
> }
> --- a/xen/arch/x86/mm/shadow/hvm.c
> +++ b/xen/arch/x86/mm/shadow/hvm.c
> @@ -125,7 +125,7 @@ hvm_read(enum x86_segment seg,
> rc = hvm_copy_from_guest_linear(p_data, addr, bytes,
> (access_type == hvm_access_insn_fetch
> ? PFEC_insn_fetch : 0),
> - &pfinfo);
> + &pfinfo, NULL);
>
> switch ( rc )
> {
> --- a/xen/include/asm-x86/hvm/emulate.h
> +++ b/xen/include/asm-x86/hvm/emulate.h
> @@ -99,6 +99,11 @@ int hvmemul_do_pio_buffer(uint16_t port,
> void *buffer);
>
> struct hvmemul_cache;
> +struct hvmemul_cache *hvmemul_cache_init(unsigned int nents);
> +static inline void hvmemul_cache_destroy(struct hvmemul_cache *cache)
> +{
> + xfree(cache);
> +}
> bool hvmemul_read_cache(const struct hvmemul_cache *, paddr_t gpa,
> unsigned int level, void *buffer, unsigned int size);
> void hvmemul_write_cache(struct hvmemul_cache *, paddr_t gpa,
> --- a/xen/include/asm-x86/hvm/support.h
> +++ b/xen/include/asm-x86/hvm/support.h
> @@ -99,7 +99,7 @@ enum hvm_translation_result hvm_copy_to_
> pagefault_info_t *pfinfo);
> enum hvm_translation_result hvm_copy_from_guest_linear(
> void *buf, unsigned long addr, int size, uint32_t pfec,
> - pagefault_info_t *pfinfo);
> + pagefault_info_t *pfinfo, struct hvmemul_cache *cache);
>
> /*
> * Get a reference on the page under an HVM physical or linear address. If
> @@ -110,7 +110,7 @@ enum hvm_translation_result hvm_copy_fro
> enum hvm_translation_result hvm_translate_get_page(
> struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec,
> pagefault_info_t *pfinfo, struct page_info **page_p,
> - gfn_t *gfn_p, p2m_type_t *p2mt_p);
> + gfn_t *gfn_p, p2m_type_t *p2mt_p, struct hvmemul_cache *cache);
>
> #define HVM_HCALL_completed 0 /* hypercall completed - no further action */
> #define HVM_HCALL_preempted 1 /* hypercall preempted - re-execute VMCALL */
> --- a/xen/include/asm-x86/hvm/vcpu.h
> +++ b/xen/include/asm-x86/hvm/vcpu.h
> @@ -53,8 +53,6 @@ struct hvm_mmio_cache {
> uint8_t buffer[32];
> };
>
> -struct hvmemul_cache;
> -
> struct hvm_vcpu_io {
> /* I/O request in flight to device model. */
> enum hvm_io_completion io_completion;
> @@ -200,6 +198,7 @@ struct hvm_vcpu {
> u8 cache_mode;
>
> struct hvm_vcpu_io hvm_io;
> + struct hvmemul_cache *data_cache;
>
> /* Pending hw/sw interrupt (.vector = -1 means nothing pending). */
> struct x86_event inject_event;
>
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel