Xen project Mailing List

[Xen-devel] Ping: [PATCH v3 3/4] x86/HVM: implement memory read caching

>>> On 25.09.18 at 16:25, <JBeulich@xxxxxxxx> wrote: > Emulation requiring device model assistance uses a form of instruction > re-execution, assuming that the second (and any further) pass takes > exactly the same path. This is a valid assumption as far use of CPU > registers goes (as those can't change without any other instruction > executing in between), but is wrong for memory accesses. In particular > it has been observed that Windows might page out buffers underneath an > instruction currently under emulation (hitting between two passes). If > the first pass translated a linear address successfully, any subsequent > pass needs to do so too, yielding the exact same translation. > > Introduce a cache (used by just guest page table accesses for now) to > make sure above described assumption holds. This is a very simplistic > implementation for now: Only exact matches are satisfied (no overlaps or > partial reads or anything). > > As to the actual data page in this scenario, there are a couple of > aspects to take into consideration: > - We must be talking about an insn accessing two locations (two memory > ones, one of which is MMIO, or a memory and an I/O one). > - If the non I/O / MMIO side is being read, the re-read (if it occurs at > all) is having its result discarded, by taking the shortcut through > the first switch()'s STATE_IORESP_READY case in hvmemul_do_io(). Note > how, among all the re-issue sanity checks there, we avoid comparing > the actual data. > - If the non I/O / MMIO side is being written, it is the OSes > responsibility to avoid actually moving page contents to disk while > there might still be a write access in flight - this is no different > in behavior from bare hardware. > - Read-modify-write accesses are, as always, complicated, and while we > deal with them better nowadays than we did in the past, we're still > not quite there to guarantee hardware like behavior in all cases > anyway. Nothing is getting worse by the changes made here, afaict. > > Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx> > Acked-by: Tim Deegan <tim@xxxxxxx> > Reviewed-by: Paul Durrant <paul.durrant@xxxxxxxxxx> SVM and VMX maintainers? Thanks, Jan > --- > v3: Add text about the actual data page to the description. > v2: Re-base. > > --- a/xen/arch/x86/hvm/emulate.c > +++ b/xen/arch/x86/hvm/emulate.c > @@ -27,6 +27,18 @@ > #include <asm/hvm/svm/svm.h> > #include <asm/vm_event.h> > > +struct hvmemul_cache > +{ > + unsigned int max_ents; > + unsigned int num_ents; > + struct { > + paddr_t gpa:PADDR_BITS; > + unsigned int size:(BITS_PER_LONG - PADDR_BITS) / 2; > + unsigned int level:(BITS_PER_LONG - PADDR_BITS) / 2; > + unsigned long data; > + } ents[]; > +}; > + > static void hvmtrace_io_assist(const ioreq_t *p) > { > unsigned int size, event; > @@ -541,7 +553,7 @@ static int hvmemul_do_mmio_addr(paddr_t > */ > static void *hvmemul_map_linear_addr( > unsigned long linear, unsigned int bytes, uint32_t pfec, > - struct hvm_emulate_ctxt *hvmemul_ctxt) > + struct hvm_emulate_ctxt *hvmemul_ctxt, struct hvmemul_cache *cache) > { > struct vcpu *curr = current; > void *err, *mapping; > @@ -586,7 +598,7 @@ static void *hvmemul_map_linear_addr( > ASSERT(mfn_x(*mfn) == 0); > > res = hvm_translate_get_page(curr, addr, true, pfec, > - &pfinfo, &page, NULL, &p2mt); > + &pfinfo, &page, NULL, &p2mt, cache); > > switch ( res ) > { > @@ -702,6 +714,8 @@ static int hvmemul_linear_to_phys( > gfn_t gfn, ngfn; > unsigned long done, todo, i, offset = addr & ~PAGE_MASK; > int reverse; > + struct hvmemul_cache *cache = pfec & PFEC_insn_fetch > + ? NULL : curr->arch.hvm.data_cache; > > /* > * Clip repetitions to a sensible maximum. This avoids extensive > looping in > @@ -731,7 +745,7 @@ static int hvmemul_linear_to_phys( > return rc; > gfn = gaddr_to_gfn(gaddr); > } > - else if ( gfn_eq(gfn = paging_gla_to_gfn(curr, addr, &pfec, NULL), > + else if ( gfn_eq(gfn = paging_gla_to_gfn(curr, addr, &pfec, cache), > INVALID_GFN) ) > { > if ( pfec & (PFEC_page_paged | PFEC_page_shared) ) > @@ -747,7 +761,7 @@ static int hvmemul_linear_to_phys( > { > /* Get the next PFN in the range. */ > addr += reverse ? -PAGE_SIZE : PAGE_SIZE; > - ngfn = paging_gla_to_gfn(curr, addr, &pfec, NULL); > + ngfn = paging_gla_to_gfn(curr, addr, &pfec, cache); > > /* Is it contiguous with the preceding PFNs? If not then we're > done. */ > if ( gfn_eq(ngfn, INVALID_GFN) || > @@ -1073,7 +1087,10 @@ static int linear_read(unsigned long add > uint32_t pfec, struct hvm_emulate_ctxt > *hvmemul_ctxt) > { > pagefault_info_t pfinfo; > - int rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo); > + int rc = hvm_copy_from_guest_linear(p_data, addr, bytes, pfec, &pfinfo, > + (pfec & PFEC_insn_fetch > + ? NULL > + : current->arch.hvm.data_cache)); > > switch ( rc ) > { > @@ -1270,7 +1287,8 @@ static int hvmemul_write( > > if ( !known_gla(addr, bytes, pfec) ) > { > - mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt); > + mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt, > + current->arch.hvm.data_cache); > if ( IS_ERR(mapping) ) > return ~PTR_ERR(mapping); > } > @@ -1312,7 +1330,8 @@ static int hvmemul_rmw( > > if ( !known_gla(addr, bytes, pfec) ) > { > - mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt); > + mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt, > + current->arch.hvm.data_cache); > if ( IS_ERR(mapping) ) > return ~PTR_ERR(mapping); > } > @@ -1466,7 +1485,8 @@ static int hvmemul_cmpxchg( > else if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 ) > pfec |= PFEC_user_mode; > > - mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt); > + mapping = hvmemul_map_linear_addr(addr, bytes, pfec, hvmemul_ctxt, > + curr->arch.hvm.data_cache); > if ( IS_ERR(mapping) ) > return ~PTR_ERR(mapping); > > @@ -2373,6 +2393,7 @@ static int _hvm_emulate_one(struct hvm_e > { > vio->mmio_cache_count = 0; > vio->mmio_insn_bytes = 0; > + curr->arch.hvm.data_cache->num_ents = 0; > } > else > { > @@ -2591,7 +2612,7 @@ void hvm_emulate_init_per_insn( > &addr) && > hvm_copy_from_guest_linear(hvmemul_ctxt->insn_buf, addr, > sizeof(hvmemul_ctxt->insn_buf), > - pfec | PFEC_insn_fetch, > + pfec | PFEC_insn_fetch, NULL, > NULL) == HVMTRANS_okay) ? > sizeof(hvmemul_ctxt->insn_buf) : 0; > } > @@ -2664,9 +2685,35 @@ void hvm_dump_emulation_state(const char > hvmemul_ctxt->insn_buf); > } > > +struct hvmemul_cache *hvmemul_cache_init(unsigned int nents) > +{ > + struct hvmemul_cache *cache = xmalloc_bytes(offsetof(struct > hvmemul_cache, > + ents[nents])); > + > + if ( cache ) > + { > + cache->num_ents = 0; > + cache->max_ents = nents; > + } > + > + return cache; > +} > + > bool hvmemul_read_cache(const struct hvmemul_cache *cache, paddr_t gpa, > unsigned int level, void *buffer, unsigned int > size) > { > + unsigned int i; > + > + ASSERT(size <= sizeof(cache->ents->data)); > + > + for ( i = 0; i < cache->num_ents; ++i ) > + if ( cache->ents[i].level == level && cache->ents[i].gpa == gpa && > + cache->ents[i].size == size ) > + { > + memcpy(buffer, &cache->ents[i].data, size); > + return true; > + } > + > return false; > } > > @@ -2674,6 +2721,35 @@ void hvmemul_write_cache(struct hvmemul_ > unsigned int level, const void *buffer, > unsigned int size) > { > + unsigned int i; > + > + if ( size > sizeof(cache->ents->data) ) > + { > + ASSERT_UNREACHABLE(); > + return; > + } > + > + for ( i = 0; i < cache->num_ents; ++i ) > + if ( cache->ents[i].level == level && cache->ents[i].gpa == gpa && > + cache->ents[i].size == size ) > + { > + memcpy(&cache->ents[i].data, buffer, size); > + return; > + } > + > + if ( unlikely(i >= cache->max_ents) ) > + { > + ASSERT_UNREACHABLE(); > + return; > + } > + > + cache->ents[i].level = level; > + cache->ents[i].gpa = gpa; > + cache->ents[i].size = size; > + > + memcpy(&cache->ents[i].data, buffer, size); > + > + cache->num_ents = i + 1; > } > > /* > --- a/xen/arch/x86/hvm/hvm.c > +++ b/xen/arch/x86/hvm/hvm.c > @@ -1498,6 +1498,17 @@ int hvm_vcpu_initialise(struct vcpu *v) > > v->arch.hvm.inject_event.vector = HVM_EVENT_VECTOR_UNSET; > > + /* > + * Leaving aside the insn fetch, for which we don't use this cache, no > + * insn can access more than 8 independent linear addresses (AVX2 > + * gathers being the worst). Each such linear range can span a page > + * boundary, i.e. require two page walks. > + */ > + v->arch.hvm.data_cache = hvmemul_cache_init(CONFIG_PAGING_LEVELS * 8 * > 2); > + rc = -ENOMEM; > + if ( !v->arch.hvm.data_cache ) > + goto fail4; > + > rc = setup_compat_arg_xlat(v); /* teardown: free_compat_arg_xlat() */ > if ( rc != 0 ) > goto fail4; > @@ -1527,6 +1538,7 @@ int hvm_vcpu_initialise(struct vcpu *v) > fail5: > free_compat_arg_xlat(v); > fail4: > + hvmemul_cache_destroy(v->arch.hvm.data_cache); > hvm_funcs.vcpu_destroy(v); > fail3: > vlapic_destroy(v); > @@ -1549,6 +1561,8 @@ void hvm_vcpu_destroy(struct vcpu *v) > > free_compat_arg_xlat(v); > > + hvmemul_cache_destroy(v->arch.hvm.data_cache); > + > tasklet_kill(&v->arch.hvm.assert_evtchn_irq_tasklet); > hvm_funcs.vcpu_destroy(v); > > @@ -2923,7 +2937,7 @@ void hvm_task_switch( > } > > rc = hvm_copy_from_guest_linear( > - &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo); > + &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo, NULL); > if ( rc == HVMTRANS_bad_linear_to_gfn ) > hvm_inject_page_fault(pfinfo.ec, pfinfo.linear); > if ( rc != HVMTRANS_okay ) > @@ -2970,7 +2984,7 @@ void hvm_task_switch( > goto out; > > rc = hvm_copy_from_guest_linear( > - &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo); > + &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo, NULL); > if ( rc == HVMTRANS_bad_linear_to_gfn ) > hvm_inject_page_fault(pfinfo.ec, pfinfo.linear); > /* > @@ -3081,7 +3095,7 @@ void hvm_task_switch( > enum hvm_translation_result hvm_translate_get_page( > struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec, > pagefault_info_t *pfinfo, struct page_info **page_p, > - gfn_t *gfn_p, p2m_type_t *p2mt_p) > + gfn_t *gfn_p, p2m_type_t *p2mt_p, struct hvmemul_cache *cache) > { > struct page_info *page; > p2m_type_t p2mt; > @@ -3089,7 +3103,7 @@ enum hvm_translation_result hvm_translat > > if ( linear ) > { > - gfn = paging_gla_to_gfn(v, addr, &pfec, NULL); > + gfn = paging_gla_to_gfn(v, addr, &pfec, cache); > > if ( gfn_eq(gfn, INVALID_GFN) ) > { > @@ -3161,7 +3175,7 @@ enum hvm_translation_result hvm_translat > #define HVMCOPY_linear (1u<<2) > static enum hvm_translation_result __hvm_copy( > void *buf, paddr_t addr, int size, struct vcpu *v, unsigned int flags, > - uint32_t pfec, pagefault_info_t *pfinfo) > + uint32_t pfec, pagefault_info_t *pfinfo, struct hvmemul_cache *cache) > { > gfn_t gfn; > struct page_info *page; > @@ -3194,8 +3208,8 @@ static enum hvm_translation_result __hvm > > count = min_t(int, PAGE_SIZE - gpa, todo); > > - res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear, > - pfec, pfinfo, &page, &gfn, &p2mt); > + res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear, pfec, > + pfinfo, &page, &gfn, &p2mt, cache); > if ( res != HVMTRANS_okay ) > return res; > > @@ -3242,14 +3256,14 @@ enum hvm_translation_result hvm_copy_to_ > paddr_t paddr, void *buf, int size, struct vcpu *v) > { > return __hvm_copy(buf, paddr, size, v, > - HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL); > + HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL, NULL); > } > > enum hvm_translation_result hvm_copy_from_guest_phys( > void *buf, paddr_t paddr, int size) > { > return __hvm_copy(buf, paddr, size, current, > - HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL); > + HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL, NULL); > } > > enum hvm_translation_result hvm_copy_to_guest_linear( > @@ -3258,16 +3272,17 @@ enum hvm_translation_result hvm_copy_to_ > { > return __hvm_copy(buf, addr, size, current, > HVMCOPY_to_guest | HVMCOPY_linear, > - PFEC_page_present | PFEC_write_access | pfec, > pfinfo); > + PFEC_page_present | PFEC_write_access | pfec, > + pfinfo, NULL); > } > > enum hvm_translation_result hvm_copy_from_guest_linear( > void *buf, unsigned long addr, int size, uint32_t pfec, > - pagefault_info_t *pfinfo) > + pagefault_info_t *pfinfo, struct hvmemul_cache *cache) > { > return __hvm_copy(buf, addr, size, current, > HVMCOPY_from_guest | HVMCOPY_linear, > - PFEC_page_present | pfec, pfinfo); > + PFEC_page_present | pfec, pfinfo, cache); > } > > unsigned long copy_to_user_hvm(void *to, const void *from, unsigned int > len) > @@ -3308,7 +3323,8 @@ unsigned long copy_from_user_hvm(void *t > return 0; > } > > - rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len, 0, NULL); > + rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len, > + 0, NULL, NULL); > return rc ? len : 0; /* fake a copy_from_user() return code */ > } > > @@ -3724,7 +3740,7 @@ void hvm_ud_intercept(struct cpu_user_re > sizeof(sig), hvm_access_insn_fetch, > cs, &addr) && > (hvm_copy_from_guest_linear(sig, addr, sizeof(sig), > - walk, NULL) == HVMTRANS_okay) && > + walk, NULL, NULL) == > HVMTRANS_okay) && > (memcmp(sig, "\xf\xbxen", sizeof(sig)) == 0) ) > { > regs->rip += sizeof(sig); > --- a/xen/arch/x86/hvm/svm/svm.c > +++ b/xen/arch/x86/hvm/svm/svm.c > @@ -1358,7 +1358,7 @@ static void svm_emul_swint_injection(str > goto raise_exception; > > rc = hvm_copy_from_guest_linear(&idte, idte_linear_addr, idte_size, > - PFEC_implicit, &pfinfo); > + PFEC_implicit, &pfinfo, NULL); > if ( rc ) > { > if ( rc == HVMTRANS_bad_linear_to_gfn ) > --- a/xen/arch/x86/hvm/vmx/vvmx.c > +++ b/xen/arch/x86/hvm/vmx/vvmx.c > @@ -475,7 +475,7 @@ static int decode_vmx_inst(struct cpu_us > { > pagefault_info_t pfinfo; > int rc = hvm_copy_from_guest_linear(poperandS, base, size, > - 0, &pfinfo); > + 0, &pfinfo, NULL); > > if ( rc == HVMTRANS_bad_linear_to_gfn ) > hvm_inject_page_fault(pfinfo.ec, pfinfo.linear); > --- a/xen/arch/x86/mm/shadow/common.c > +++ b/xen/arch/x86/mm/shadow/common.c > @@ -166,7 +166,7 @@ const struct x86_emulate_ops *shadow_ini > hvm_access_insn_fetch, sh_ctxt, &addr) && > !hvm_copy_from_guest_linear( > sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), > - PFEC_insn_fetch, NULL)) > + PFEC_insn_fetch, NULL, NULL)) > ? sizeof(sh_ctxt->insn_buf) : 0; > > return &hvm_shadow_emulator_ops; > @@ -201,7 +201,7 @@ void shadow_continue_emulation(struct sh > hvm_access_insn_fetch, sh_ctxt, &addr) && > !hvm_copy_from_guest_linear( > sh_ctxt->insn_buf, addr, sizeof(sh_ctxt->insn_buf), > - PFEC_insn_fetch, NULL)) > + PFEC_insn_fetch, NULL, NULL)) > ? sizeof(sh_ctxt->insn_buf) : 0; > sh_ctxt->insn_buf_eip = regs->rip; > } > --- a/xen/arch/x86/mm/shadow/hvm.c > +++ b/xen/arch/x86/mm/shadow/hvm.c > @@ -125,7 +125,7 @@ hvm_read(enum x86_segment seg, > rc = hvm_copy_from_guest_linear(p_data, addr, bytes, > (access_type == hvm_access_insn_fetch > ? PFEC_insn_fetch : 0), > - &pfinfo); > + &pfinfo, NULL); > > switch ( rc ) > { > --- a/xen/include/asm-x86/hvm/emulate.h > +++ b/xen/include/asm-x86/hvm/emulate.h > @@ -99,6 +99,11 @@ int hvmemul_do_pio_buffer(uint16_t port, > void *buffer); > > struct hvmemul_cache; > +struct hvmemul_cache *hvmemul_cache_init(unsigned int nents); > +static inline void hvmemul_cache_destroy(struct hvmemul_cache *cache) > +{ > + xfree(cache); > +} > bool hvmemul_read_cache(const struct hvmemul_cache *, paddr_t gpa, > unsigned int level, void *buffer, unsigned int > size); > void hvmemul_write_cache(struct hvmemul_cache *, paddr_t gpa, > --- a/xen/include/asm-x86/hvm/support.h > +++ b/xen/include/asm-x86/hvm/support.h > @@ -99,7 +99,7 @@ enum hvm_translation_result hvm_copy_to_ > pagefault_info_t *pfinfo); > enum hvm_translation_result hvm_copy_from_guest_linear( > void *buf, unsigned long addr, int size, uint32_t pfec, > - pagefault_info_t *pfinfo); > + pagefault_info_t *pfinfo, struct hvmemul_cache *cache); > > /* > * Get a reference on the page under an HVM physical or linear address. If > @@ -110,7 +110,7 @@ enum hvm_translation_result hvm_copy_fro > enum hvm_translation_result hvm_translate_get_page( > struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec, > pagefault_info_t *pfinfo, struct page_info **page_p, > - gfn_t *gfn_p, p2m_type_t *p2mt_p); > + gfn_t *gfn_p, p2m_type_t *p2mt_p, struct hvmemul_cache *cache); > > #define HVM_HCALL_completed 0 /* hypercall completed - no further action > */ > #define HVM_HCALL_preempted 1 /* hypercall preempted - re-execute VMCALL > */ > --- a/xen/include/asm-x86/hvm/vcpu.h > +++ b/xen/include/asm-x86/hvm/vcpu.h > @@ -53,8 +53,6 @@ struct hvm_mmio_cache { > uint8_t buffer[32]; > }; > > -struct hvmemul_cache; > - > struct hvm_vcpu_io { > /* I/O request in flight to device model. */ > enum hvm_io_completion io_completion; > @@ -200,6 +198,7 @@ struct hvm_vcpu { > u8 cache_mode; > > struct hvm_vcpu_io hvm_io; > + struct hvmemul_cache *data_cache; > > /* Pending hw/sw interrupt (.vector = -1 means nothing pending). */ > struct x86_event inject_event; > > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@xxxxxxxxxxxxxxxxxxxx > https://lists.xenproject.org/mailman/listinfo/xen-devel _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxxx https://lists.xenproject.org/mailman/listinfo/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.