
Re: [Xen-devel] [PATCH 15/17] vmx: nest: virtual ept for nested



At 10:41 +0100 on 22 Apr (1271932887), Qing He wrote:
> This patch adds virtual ept capability to L1.
> It's implemented as a simple per vCPU vTLB like component
> independent to domain wide p2m.
> 
> Signed-off-by: Qing He <qing.he@xxxxxxxxx>

> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/nest.c
> --- a/xen/arch/x86/hvm/vmx/nest.c       Thu Apr 22 22:30:09 2010 +0800
> +++ b/xen/arch/x86/hvm/vmx/nest.c       Thu Apr 22 22:30:10 2010 +0800
> @@ -26,6 +26,7 @@
>  #include <asm/hvm/vmx/vmx.h>
>  #include <asm/hvm/vmx/vvmcs.h>
>  #include <asm/hvm/vmx/nest.h>
> +#include <asm/hvm/vmx/vept.h>
> 
>  /*
>   * VMX instructions support functions
> @@ -295,6 +296,9 @@
>      __vmptrld(virt_to_maddr(nest->hvmcs));
>      v->arch.hvm_vmx.launched = 0;
> 
> +    nest->geptp = 0;
> +    nest->vept = vept_init(v);
> +
>      vmreturn(regs, VMSUCCEED);
> 
>  out:
> @@ -313,6 +317,9 @@
>      if ( unlikely(!nest->guest_vmxon_pa) )
>          goto invalid_op;
> 
> +    vept_teardown(nest->vept);
> +    nest->vept = 0;
> +
>      nest->guest_vmxon_pa = 0;
>      __vmpclear(virt_to_maddr(nest->svmcs));
> 
> @@ -529,6 +536,67 @@
>      return vmx_nest_handle_vmresume(regs);
>  }
> 
> +int vmx_nest_handle_invept(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct vmx_inst_decoded decode;
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    mfn_t mfn;
> +    u64 eptp;
> +    int type;
> +
> +    if ( unlikely(!nest->guest_vmxon_pa) )
> +        goto invalid_op;
> +
> +    decode_vmx_inst(regs, &decode);
> +
> +    hvm_copy_from_guest_virt(&eptp, decode.mem, sizeof(eptp), 0);
> +    type = reg_read(regs, decode.reg2);

Needs error handling like the other new instructions. 
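Something along these lines, perhaps (untested sketch -- it assumes
decode_vmx_inst() reports failure with the X86EMUL_* codes and injects the
fault itself, like the other handlers in this series, and that a
VMFAIL_INVALID vmreturn is the right response to a bad operand read):

    if ( decode_vmx_inst(regs, &decode) != X86EMUL_OKAY )
        return X86EMUL_EXCEPTION;

    if ( hvm_copy_from_guest_virt(&eptp, decode.mem, sizeof(eptp), 0)
         != HVMCOPY_okay )
    {
        vmreturn(regs, VMFAIL_INVALID);
        return X86EMUL_OKAY;
    }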

> +    /* TODO: physical invept on other cpus */

?

> +    switch ( type )
> +    {
> +    case 1:
> +        mfn = vept_invalidate(nest->vept, eptp);
> +        if ( eptp == nest->geptp )
> +            nest->geptp = 0;
> +
> +        if ( __mfn_valid(mfn_x(mfn)) )
> +            __invept(1, mfn_x(mfn) << PAGE_SHIFT | (eptp & 0xfff), 0);
> +        break;
> +    case 2:
> +        vept_invalidate_all(nest->vept);
> +        nest->geptp = 0;
> +        break;
> +    default:
> +        gdprintk(XENLOG_ERR, "nest: unsupported invept type %d\n", type);
> +        break;
> +    }
> +
> +    vmreturn(regs, VMSUCCEED);
> +
> +    return X86EMUL_OKAY;
> +
> +invalid_op:
> +    hvm_inject_exception(TRAP_invalid_op, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +}
> +
> +int vmx_nest_vept(struct vcpu *v)
> +{
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    int r = 0;
> +
> +    if ( paging_mode_hap(v->domain) &&
> +         (__get_vvmcs(nest->vvmcs, CPU_BASED_VM_EXEC_CONTROL) &
> +          CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
> +         (__get_vvmcs(nest->vvmcs, SECONDARY_VM_EXEC_CONTROL) &
> +          SECONDARY_EXEC_ENABLE_EPT) )
> +        r = 1;
> +
> +    return r;
> +}
> +
>  /*
>   * Nested VMX context switch
>   */
> @@ -739,7 +807,14 @@
>      vvmcs_to_shadow(nest->vvmcs, CR0_GUEST_HOST_MASK);
>      vvmcs_to_shadow(nest->vvmcs, CR4_GUEST_HOST_MASK);
> 
> -    /* TODO: PDPTRs for nested ept */
> +    if ( vmx_nest_vept(v) )
> +    {
> +        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR0);
> +        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR1);
> +        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR2);
> +        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR3);
> +    }
> +
>      /* TODO: CR3 target control */
>  }
> 
> @@ -787,14 +862,32 @@
>      }
>  #endif
> 
> +
> +    /* loading EPT_POINTER for L2 */
> +    if ( vmx_nest_vept(v) )
> +    {
> +        u64 geptp;
> +        mfn_t mfn;
> +
> +        geptp = __get_vvmcs(nest->vvmcs, EPT_POINTER);
> +        if ( geptp != nest->geptp )
> +        {
> +            mfn = vept_load_eptp(nest->vept, geptp);

What if vept_load_eptp() returns INVALID_MFN?
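At a minimum I'd expect something like this (sketch only; crashing the guest
is just one way to bail out of the entry path, and __mfn_valid() matches how
the invept handler above checks its mfn):

            mfn = vept_load_eptp(nest->vept, geptp);
            if ( !__mfn_valid(mfn_x(mfn)) )
            {
                gdprintk(XENLOG_ERR,
                         "nest: no shadow EPT root for eptp %"PRIx64"\n", geptp);
                domain_crash(v->domain);  /* don't load a junk EPT_POINTER */
                return;
            }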

> +            nest->geptp = geptp;
> +
> +            __vmwrite(EPT_POINTER, (mfn_x(mfn) << PAGE_SHIFT) | 0x1e);
> +#ifdef __i386__
> +            __vmwrite(EPT_POINTER_HIGH, (mfn_x(mfn) << PAGE_SHIFT) >> 32);
> +#endif
> +        }
> +    }
> +
>      regs->rip = __get_vvmcs(nest->vvmcs, GUEST_RIP);
>      regs->rsp = __get_vvmcs(nest->vvmcs, GUEST_RSP);
>      regs->rflags = __get_vvmcs(nest->vvmcs, GUEST_RFLAGS);
> 
>      /* updating host cr0 to sync TS bit */
>      __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
> -
> -    /* TODO: EPT_POINTER */
>  }
> 
>  static void sync_vvmcs_guest_state(struct vmx_nest_struct *nest)
> @@ -1064,8 +1157,26 @@
>          break;
>      }
> 
> +    case EXIT_REASON_EPT_VIOLATION:
> +    {
> +        unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> +        paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
> +#ifdef __i386__
> +        gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32;
> +#endif
> +        if ( vmx_nest_vept(v) )
> +        {
> +            if ( !vept_ept_violation(nest->vept, nest->geptp,
> +                     exit_qualification, gpa) )
> +                bypass_l0 = 1;
> +            else
> +                nest->vmexit_pending = 1;

Since bypass_l0 is set from vmexit_pending() here, it looks like it's
always going to be set.  Does that mean we never handle a real EPT
violation at L0?  I would expect there to be three possible outcomes
here: give the violation to L1, give it to L0, or fix it in the vept and
discard it.
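i.e. something shaped more like this, with vept_ept_violation() telling the
three cases apart (the VEPT_* names are invented here, just to illustrate):

        if ( vmx_nest_vept(v) )
        {
            switch ( vept_ept_violation(nest->vept, nest->geptp,
                                        exit_qualification, gpa) )
            {
            case VEPT_FIXED:        /* fixed up in the shadow table, discard */
                bypass_l0 = 1;
                break;
            case VEPT_INJECT_L1:    /* not mapped by L1's tables */
                nest->vmexit_pending = 1;
                break;
            case VEPT_HANDLE_L0:    /* L1 maps it; let L0's handler run */
            default:
                break;
            }
        }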

> +        }
> +
> +        break;
> +    }
> +
>      case EXIT_REASON_WBINVD:
> -    case EXIT_REASON_EPT_VIOLATION:
>      case EXIT_REASON_EPT_MISCONFIG:
>      case EXIT_REASON_EXTERNAL_INTERRUPT:
>          /* pass to L0 handler */
> @@ -1229,11 +1340,14 @@
>          data = (data << 32) | eax;
>          break;
>      case MSR_IA32_VMX_PROCBASED_CTLS:
> +        mask = paging_mode_hap(current->domain)?
> +                   0: CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> +
>          rdmsr(regs->ecx, eax, edx);
>  #define REMOVED_EXEC_CONTROL_CAP (CPU_BASED_TPR_SHADOW \
> -            | CPU_BASED_ACTIVATE_MSR_BITMAP            \
> -            | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
> +            | CPU_BASED_ACTIVATE_MSR_BITMAP)
>          data = edx & ~REMOVED_EXEC_CONTROL_CAP;
> +        data = edx & ~mask;
>          data = (data << 32) | eax;
>          break;
>      case MSR_IA32_VMX_EXIT_CTLS:
> @@ -1254,12 +1368,20 @@
>          data = (data << 32) | eax;
>          break;
>      case MSR_IA32_VMX_PROCBASED_CTLS2:
> -        mask = 0;
> +        mask = paging_mode_hap(current->domain)?
> +                   SECONDARY_EXEC_ENABLE_EPT : 0;
> 
>          rdmsr(regs->ecx, eax, edx);
>          data = edx & mask;
>          data = (data << 32) | eax;
>          break;
> +    case MSR_IA32_VMX_EPT_VPID_CAP:
> +        rdmsr(regs->ecx, eax, edx);
> +#define REMOVED_EPT_VPID_CAP_HIGH   ( 1 | 1<<8 | 1<<9 | 1<<10 | 1<<11 )
> +#define REMOVED_EPT_VPID_CAP_LOW    ( 1<<16 | 1<<17 | 1<<26 )
> +        data = edx & ~REMOVED_EPT_VPID_CAP_HIGH;
> +        data = (data << 32) | (eax & ~REMOVED_EPT_VPID_CAP_LOW);
> +        break;
> 
>      /* pass through MSRs */
>      case IA32_FEATURE_CONTROL_MSR:
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/vept.c
> --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/arch/x86/hvm/vmx/vept.c       Thu Apr 22 22:30:10 2010 +0800
> @@ -0,0 +1,574 @@
> +/*
> + * vept.c: virtual EPT for nested virtualization
> + *
> + * Copyright (c) 2010, Intel Corporation.
> + * Author: Qing He <qing.he@xxxxxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.
> + *
> + */
> +
> +#include <xen/config.h>
> +#include <xen/types.h>
> +#include <xen/list.h>
> +#include <xen/mm.h>
> +#include <xen/paging.h>
> +#include <xen/domain_page.h>
> +#include <xen/sched.h>
> +#include <asm/page.h>
> +#include <xen/numa.h>
> +#include <asm/hvm/vmx/vmx.h>
> +#include <asm/hvm/vmx/vept.h>
> +
> +#undef mfn_to_page
> +#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
> +#undef mfn_valid
> +#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
> +#undef page_to_mfn
> +#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
> +
> +/*
> + * This virtual EPT implementation is independent to p2m facility
> + * and has some different characteristics. It works in a similar
> + * way as shadow page table (guest table and host table composition),
> + * but is per-vcpu, and of vTLB style
> + *   - per vCPU so no lock is required

What happens when dom0 changes domU's p2m table?  Don't you need to
shoot down existing vEPT tables from a foreign CPU?
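For instance, the p2m update paths could kick every vcpu's vept along these
lines -- vept_flush_domain() and the vept_flush_pending flag are hypothetical,
just to show the shape of it:

    /* called from the p2m write/teardown path: */
    void vept_flush_domain(struct domain *d)
    {
        struct vcpu *v;

        for_each_vcpu ( d, v )
            v->arch.hvm_vmx.nest.vept_flush_pending = 1;
    }

    /* ...and checked by the owning vcpu before its next virtual vmentry: */
    if ( nest->vept_flush_pending )
    {
        vept_invalidate_all(nest->vept);
        nest->geptp = 0;
        nest->vept_flush_pending = 0;
    }

plus whatever IPI or ordering is needed so a vcpu already running L2 notices
before it touches the stale shadow tables again.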

> + *   - vTLB style signifies honoring all invalidations, and not
> + * write protection. Unlike ordinary page table, since EPT updates
> + * and invalidations are minimal in a well written VMM, overhead
> + * is also minimized.
> + *
> + * The physical root is loaded directly to L2 sVMCS, without entering
> + * any other host controls. Multiple `cache slots' are maintained
> + * for multiple guest EPTPs, with simple LRU replacement.
> + *
> + * One of the limitations so far, is that it doesn't work with
> + * L0 emulation code, so L1 p2m_mmio_direct on top of L0 p2m_mmio_dm
> + * is not supported as for now.

Is this something you intend to fix before we check it in?

> + */
> +
> +#define VEPT_MAX_SLOTS 8
> +#define VEPT_ALLOCATION_SIZE 512
> +
> +struct vept_slot {
> +    u64               eptp;   /* guest eptp */
> +    mfn_t             root;   /* root of phys table */
> +    struct list_head  list;
> +
> +    struct page_list_head page_list;
> +};
> +
> +struct vept {
> +    struct list_head   used_slots; /* lru: new->tail, old->head */
> +    struct list_head   free_slots;
> +
> +    int                total_pages;
> +    int                free_pages;
> +    struct page_list_head freelist;
> +
> +    struct vcpu       *vcpu;
> +};
> +
> +
> +static struct vept_slot *__get_eptp_slot(struct vept *vept, u64 geptp)
> +{
> +    struct vept_slot *slot, *tmp;
> +
> +    list_for_each_entry_safe( slot, tmp, &vept->used_slots, list )
> +        if ( slot->eptp == geptp )
> +            return slot;
> +
> +    return NULL;
> +}
> +
> +static struct vept_slot *get_eptp_slot(struct vept *vept, u64 geptp)
> +{
> +    struct vept_slot *slot;
> +
> +    slot = __get_eptp_slot(vept, geptp);
> +    if ( slot != NULL )
> +        list_del(&slot->list);
> +
> +    return slot;
> +}
> +
> +static void __clear_slot(struct vept *vept, struct vept_slot *slot)
> +{
> +    struct page_info *pg;
> +
> +    slot->eptp = 0;
> +
> +    while ( !page_list_empty(&slot->page_list) )
> +    {
> +        pg = page_list_remove_head(&slot->page_list);
> +        page_list_add_tail(pg, &vept->freelist);
> +
> +        vept->free_pages++;
> +    }
> +}
> +
> +static struct vept_slot *get_free_slot(struct vept *vept)
> +{
> +    struct vept_slot *slot = NULL;
> +
> +    if ( !list_empty(&vept->free_slots) )
> +    {
> +        slot = list_entry(vept->free_slots.next, struct vept_slot, list);
> +        list_del(&slot->list);
> +    }
> +    else if ( !list_empty(&vept->used_slots) )
> +    {
> +        slot = list_entry(vept->used_slots.next, struct vept_slot, list);
> +        list_del(&slot->list);
> +        __clear_slot(vept, slot);
> +    }
> +
> +    return slot;
> +}
> +
> +static void clear_all_slots(struct vept *vept)
> +{
> +    struct vept_slot *slot, *tmp;
> +
> +    list_for_each_entry_safe( slot, tmp, &vept->used_slots, list )
> +    {
> +        list_del(&slot->list);
> +        __clear_slot(vept, slot);
> +        list_add_tail(&slot->list, &vept->free_slots);
> +    }
> +}
> +
> +static int free_some_pages(struct vept *vept, struct vept_slot *curr)
> +{
> +    struct vept_slot *slot;
> +    int r = 0;
> +
> +    if ( !list_empty(&vept->used_slots) )
> +    {
> +        slot = list_entry(vept->used_slots.next, struct vept_slot, list);
> +        if ( slot != curr )
> +        {
> +            list_del(&slot->list);
> +            __clear_slot(vept, slot);
> +            list_add_tail(&slot->list, &vept->free_slots);
> +
> +            r = 1;
> +        }
> +    }
> +
> +    return r;
> +}
> +
> +struct vept *vept_init(struct vcpu *v)
> +{
> +    struct vept *vept;
> +    struct vept_slot *slot;
> +    struct page_info *pg;
> +    int i;
> +
> +    vept = xmalloc(struct vept);
> +    if ( vept == NULL )
> +        goto out;
> +
> +    memset(vept, 0, sizeof(*vept));
> +    vept->vcpu = v;
> +
> +    INIT_PAGE_LIST_HEAD(&vept->freelist);
> +    INIT_LIST_HEAD(&vept->used_slots);
> +    INIT_LIST_HEAD(&vept->free_slots);
> +
> +    for ( i = 0; i < VEPT_MAX_SLOTS; i++ )
> +    {
> +        slot = xmalloc(struct vept_slot);
> +        if ( slot == NULL )
> +            break;
> +
> +        memset(slot, 0, sizeof(*slot));
> +
> +        INIT_LIST_HEAD(&slot->list);
> +        INIT_PAGE_LIST_HEAD(&slot->page_list);
> +
> +        list_add(&slot->list, &vept->free_slots);
> +    }
> +
> +    for ( i = 0; i < VEPT_ALLOCATION_SIZE; i++ )

Why a fixed 2MB allocation?  What if your nested domains are very large?

> +    {
> +        pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(v->domain)));

Shouldn't this be allocated from the paging pool like other EPT memory?

> +        if ( pg == NULL )
> +            break;

Return an error?
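I'd have thought so -- free whatever has been allocated and return NULL,
letting the VMXON handler fail cleanly (sketch, assuming vept_teardown()
copes with a partially-built vept):

        if ( pg == NULL )
        {
            vept_teardown(vept);
            return NULL;
        }

and in vmx_nest_handle_vmxon():

    nest->vept = vept_init(v);
    if ( nest->vept == NULL )
        goto out;    /* or fail the VMXON with VMFAIL_INVALID */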

> +        page_list_add_tail(pg, &vept->freelist);
> +        vept->total_pages++;
> +        vept->free_pages++;
> +    }
> +
> + out:
> +    return vept;
> +}
> +
> +void vept_teardown(struct vept *vept)
> +{
> +    struct page_info *pg;
> +    struct vept_slot *slot, *tmp;
> +
> +    clear_all_slots(vept);
> +
> +    while ( !page_list_empty(&vept->freelist) )
> +    {
> +        pg = page_list_remove_head(&vept->freelist);
> +        free_domheap_page(pg);
> +        vept->free_pages++;
> +        vept->total_pages++;
> +    }
> +
> +    list_for_each_entry_safe( slot, tmp, &vept->free_slots, list )
> +        xfree(slot);
> +
> +    xfree(vept);
> +}
> +
> +mfn_t vept_load_eptp(struct vept *vept, u64 geptp)
> +{
> +    struct page_info *pg;
> +    struct vept_slot *slot;
> +    mfn_t mfn = _mfn(INVALID_MFN);
> +    void *addr;
> +
> +    ASSERT(vept->vcpu == current);
> +
> +    slot = get_eptp_slot(vept, geptp);
> +    if ( slot == NULL )
> +    {
> +        slot = get_free_slot(vept);
> +        if ( unlikely(slot == NULL) )
> +        {
> +            gdprintk(XENLOG_ERR, "nest: can't get free slot\n");
> +            return mfn;
> +        }
> +
> +        while ( !vept->free_pages )
> +            if ( !free_some_pages(vept, slot) )
> +            {
> +                slot->eptp = 0;
> +                list_add_tail(&slot->list, &vept->free_slots);
> +                gdprintk(XENLOG_ERR, "nest: vept no free pages\n");
> +
> +                return mfn;
> +            }
> +
> +        vept->free_pages--;
> +        pg = page_list_remove_head(&vept->freelist);
> +
> +        mfn = page_to_mfn(pg);
> +        addr = map_domain_page(mfn_x(mfn));
> +        clear_page(addr);
> +        unmap_domain_page(addr);
> +        page_list_add_tail(pg, &slot->page_list);
> +        slot->eptp = geptp;
> +        slot->root = mfn;
> +    }
> +
> +    mfn = slot->root;
> +    list_add_tail(&slot->list, &vept->used_slots);
> +
> +    return mfn;
> +}
> +
> +mfn_t vept_invalidate(struct vept *vept, u64 geptp)
> +{
> +    struct vept_slot *slot;
> +    mfn_t mfn = _mfn(INVALID_MFN);
> +
> +    ASSERT(vept->vcpu == current);
> +
> +    slot = get_eptp_slot(vept, geptp);
> +    if ( slot != NULL )
> +    {
> +        mfn = slot->root;
> +        __clear_slot(vept, slot);
> +        list_add_tail(&slot->list, &vept->free_slots);
> +    }
> +
> +    return mfn;
> +}
> +
> +void vept_invalidate_all(struct vept *vept)
> +{
> +    ASSERT(vept->vcpu == current);
> +
> +    clear_all_slots(vept);
> +}
> +
> +/*
> + * guest EPT walk and EPT violation
> + */
> +struct ept_walk {
> +    unsigned long gfn;
> +    unsigned long gfn_remainder;
> +    ept_entry_t l4e, l3e, l2e, l1e;
> +    mfn_t l4mfn, l3mfn, l2mfn, l1mfn;
> +    int sp;
> +};
> +typedef struct ept_walk ept_walk_t;
> +
> +#define GEPT_NORMAL_PAGE  0
> +#define GEPT_SUPER_PAGE   1
> +#define GEPT_NOT_PRESENT  2
> +static int guest_ept_next_level(struct vcpu *v, ept_entry_t **table,
> +               unsigned long *gfn_remainder, int level, u32 *ar,
> +               ept_entry_t *entry, mfn_t *next_mfn)
> +{
> +    int index;
> +    ept_entry_t *ept_entry;
> +    ept_entry_t *next;
> +    p2m_type_t p2mt;
> +    int rc = GEPT_NORMAL_PAGE;
> +    mfn_t mfn;
> +
> +    index = *gfn_remainder >> (level * EPT_TABLE_ORDER);
> +
> +    ept_entry = (*table) + index;
> +    *entry = *ept_entry;
> +    *ar &= entry->epte & 0x7;
> +
> +    *gfn_remainder &= (1UL << (level * EPT_TABLE_ORDER)) - 1;
> +
> +    if ( !(ept_entry->epte & 0x7) )
> +        rc = GEPT_NOT_PRESENT;
> +    else if ( ept_entry->sp_avail )
> +        rc = GEPT_SUPER_PAGE;
> +    else
> +    {
> +        mfn = gfn_to_mfn(v->domain, ept_entry->mfn, &p2mt);
> +        if ( !p2m_is_ram(p2mt) )
> +            return GEPT_NOT_PRESENT;
> +
> +        if ( next_mfn )
> +        {
> +            next = map_domain_page(mfn_x(mfn));
> +            unmap_domain_page(*table);
> +
> +            *table = next;
> +            *next_mfn = mfn;
> +        }
> +    }
> +
> +    return rc;
> +}
> +
> +static u32 guest_walk_ept(struct vcpu *v, ept_walk_t *gw,
> +                          u64 geptp, u64 ggpa)
> +{
> +    ept_entry_t *table;
> +    p2m_type_t p2mt;
> +    int rc;
> +    u32 ar = 0x7;
> +
> +    unsigned long gfn = (unsigned long) (ggpa >> PAGE_SHIFT);
> +    unsigned long gfn_remainder = gfn;
> +
> +    memset(gw, 0, sizeof(*gw));
> +    gw->gfn = gfn;
> +    gw->sp = 0;
> +
> +    gw->l4mfn = gfn_to_mfn(v->domain, geptp >> PAGE_SHIFT, &p2mt);
> +    if ( !p2m_is_ram(p2mt) )
> +        return 0;
> +
> +    table = map_domain_page(mfn_x(gw->l4mfn));
> +
> +    rc = guest_ept_next_level(v, &table, &gfn_remainder, 3, &ar,
> +                              &gw->l4e, &gw->l3mfn);
> +
> +    if ( rc )
> +        goto out;
> +
> +    rc = guest_ept_next_level(v, &table, &gfn_remainder, 2, &ar,
> +                              &gw->l3e, &gw->l2mfn);
> +
> +    if ( rc == GEPT_SUPER_PAGE )
> +        gw->sp = 2;
> +    if ( rc )
> +        goto out;
> +
> +    rc = guest_ept_next_level(v, &table, &gfn_remainder, 1, &ar,
> +                              &gw->l2e, &gw->l1mfn);
> +
> +    if ( rc == GEPT_SUPER_PAGE )
> +        gw->sp = 1;
> +    if ( rc )
> +        goto out;
> +
> +    rc = guest_ept_next_level(v, &table, &gfn_remainder, 0, &ar,
> +                              &gw->l1e, NULL);
> +
> + out:
> +    gw->gfn_remainder = gfn_remainder;
> +    unmap_domain_page(*table);
> +    return ar;
> +}
> +
> +static void epte_set_ar_bits(ept_entry_t *entry, unsigned long ar)
> +{
> +    entry->epte &= ~0x7f;
> +    entry->epte |= ar & 0x7f;
> +}
> +
> +static int shadow_ept_next_level(struct vept *vept, struct vept_slot *slot,
> +                       ept_entry_t **table, unsigned long *gfn_remainder,
> +                       int level, u32 *ar, ept_entry_t gentry)
> +{
> +    int index;
> +    ept_entry_t *sentry;
> +    ept_entry_t *next;
> +    mfn_t mfn;
> +    struct page_info *pg;
> +
> +    index = *gfn_remainder >> (level * EPT_TABLE_ORDER);
> +
> +    sentry = (*table) + index;
> +    *ar = sentry->epte & 0x7;
> +
> +    *gfn_remainder &= (1UL << (level * EPT_TABLE_ORDER)) - 1;
> +
> +    if ( !(sentry->epte & 0x7) )
> +    {
> +        while ( !vept->free_pages )
> +            if ( !free_some_pages(vept, slot) )
> +            {
> +                gdprintk(XENLOG_ERR, "nest: vept no free pages\n");
> +                return 0;
> +            }
> +
> +        vept->free_pages--;
> +        pg = page_list_remove_head(&vept->freelist);
> +        page_list_add_tail(pg, &slot->page_list);
> +        mfn = page_to_mfn(pg);
> +        next = map_domain_page(mfn_x(mfn));
> +        clear_page(next);
> +
> +        sentry->mfn = mfn_x(mfn);
> +    }
> +    else
> +    {
> +        next = map_domain_page(sentry->mfn);
> +    }
> +
> +    epte_set_ar_bits(sentry, gentry.epte);
> +
> +    unmap_domain_page(*table);
> +    *table = next;
> +
> +    return 1;
> +}
> +
> +int vept_ept_violation(struct vept *vept, u64 geptp,
> +                       unsigned long qualification, paddr_t addr)
> +{
> +    ept_walk_t gw;
> +    struct vept_slot *slot;
> +    ept_entry_t *table, *gept;
> +    ept_entry_t *sentry, *gentry;
> +    u32 old_entry, sp_ar = 0;
> +    p2m_type_t p2mt;
> +    unsigned long mfn_start = 0;
> +    unsigned long gfn_remainder;
> +    int rc, i;
> +
> +    ASSERT(vept->vcpu == current);
> +
> +    slot = __get_eptp_slot(vept, geptp);
> +    if ( unlikely(slot == NULL) )
> +        return 0;
> +
> +    rc = guest_walk_ept(vept->vcpu, &gw, geptp, addr);
> +
> +    if ( !(rc & (qualification & 0x7)) )    /* inject to guest */
> +        return 1;
> +
> +    if ( gw.sp == 2 )  /* 1G */
> +    {
> +        sp_ar = gw.l3e.epte & 0x7;
> +        mfn_start = gw.l3e.mfn +
> +                    (gw.gfn_remainder & (~(1 << EPT_TABLE_ORDER) - 1));
> +    }
> +    if ( gw.sp == 1 )  /* 2M */
> +    {
> +        sp_ar = gw.l2e.epte & 0x7;
> +        mfn_start = gw.l2e.mfn;
> +    }
> +    else
> +        mfn_start = 0;
> +
> +    table = map_domain_page(mfn_x(slot->root));
> +    gfn_remainder = gw.gfn;
> +
> +    shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 3,
> +                          &old_entry, gw.l4e);

What if shadow_ept_next_level() returns 0 ?

> +    shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 2,
> +                          &old_entry, gw.l3e);

Ditto

> +    shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 1,
> +                          &old_entry, (gw.sp == 2) ? gw.l3e : gw.l2e);

Ditto
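All three calls want their return values checked, so that we stop rather than
keep writing through a table pointer that was never advanced, e.g.:

    if ( !shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 3,
                                &old_entry, gw.l4e) ||
         !shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 2,
                                &old_entry, gw.l3e) ||
         !shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 1,
                                &old_entry, (gw.sp == 2) ? gw.l3e : gw.l2e) )
    {
        unmap_domain_page(table);
        return 0;    /* or whichever of the outcomes above is right here */
    }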

> +    /* if l1p is just allocated, do a full prefetch */
> +    if ( !old_entry && !gw.sp )
> +    {
> +        gept = map_domain_page(mfn_x(gw.l1mfn));
> +        for ( i = 0; i < 512; i++ )
> +        {
> +            gentry = gept + i;
> +            sentry = table + i;
> +            if ( gentry->epte & 0x7 )
> +            {
> +                sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> +                                        gentry->mfn, &p2mt));
> +                epte_set_ar_bits(sentry, gentry->epte);
> +            }
> +            else
> +                sentry->epte = 0;
> +        }
> +        unmap_domain_page(gept);
> +    }
> +    else if ( !old_entry && gw.sp )
> +    {
> +        for ( i = 0; i < 512; i++ )
> +        {
> +            sentry = table + i;
> +            sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> +                                    mfn_start + i, &p2mt));
> +            epte_set_ar_bits(sentry, sp_ar);
> +        }
> +    }
> +    else if ( old_entry && !gw.sp )
> +    {
> +        i = gw.gfn & ((1 << EPT_TABLE_ORDER) - 1);
> +        sentry = table + i;
> +        sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> +                                gw.l1e.mfn, &p2mt));
> +        epte_set_ar_bits(sentry, gw.l1e.epte);
> +    }
> +    else    // old_entry && gw.sp
> +    {
> +        i = gw.gfn & ((1 << EPT_TABLE_ORDER) - 1);
> +        sentry = table + i;
> +        sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> +                                mfn_start + i, &p2mt));
> +        epte_set_ar_bits(sentry, sp_ar);
> +    }
> +
> +    unmap_domain_page(table);
> +    return 0;
> +}
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/vmx.c
> --- a/xen/arch/x86/hvm/vmx/vmx.c        Thu Apr 22 22:30:09 2010 +0800
> +++ b/xen/arch/x86/hvm/vmx/vmx.c        Thu Apr 22 22:30:10 2010 +0800
> @@ -1032,6 +1032,14 @@
>      p2m_type_t p2mt;
>      char *p;
> 
> +    /*
> +     * If in nesting EPT operation, L0 doesn't have the knowledge on
> +     * how to interpret CR3, it's L1's responsibility to provide
> +     * GUEST_PDPTRn, we rely solely on them.
> +     */
> +    if ( v->arch.hvm_vcpu.in_nesting && vmx_nest_vept(v) )
> +        return;
> +
>      /* EPT needs to load PDPTRS into VMCS for PAE. */
>      if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
>          return;
> @@ -2705,6 +2713,11 @@
>          if ( vmx_nest_handle_vmxon(regs) == X86EMUL_OKAY )
>              __update_guest_eip(inst_len);
>          break;
> +    case EXIT_REASON_INVEPT:
> +        inst_len = __get_instruction_length();
> +        if ( vmx_nest_handle_invept(regs) == X86EMUL_OKAY )
> +            __update_guest_eip(inst_len);
> +        break;
> 
>      case EXIT_REASON_MWAIT_INSTRUCTION:
>      case EXIT_REASON_MONITOR_INSTRUCTION:
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/include/asm-x86/hvm/vmx/nest.h
> --- a/xen/include/asm-x86/hvm/vmx/nest.h        Thu Apr 22 22:30:09 2010 +0800
> +++ b/xen/include/asm-x86/hvm/vmx/nest.h        Thu Apr 22 22:30:10 2010 +0800
> @@ -47,6 +47,9 @@
> 
>      unsigned long        intr_info;
>      unsigned long        error_code;
> +
> +    u64                  geptp;
> +    struct vept         *vept;
>  };
> 
>  asmlinkage void vmx_nest_switch_mode(void);
> @@ -64,6 +67,8 @@
>  int vmx_nest_handle_vmresume(struct cpu_user_regs *regs);
>  int vmx_nest_handle_vmlaunch(struct cpu_user_regs *regs);
> 
> +int vmx_nest_handle_invept(struct cpu_user_regs *regs);
> +
>  void vmx_nest_update_exec_control(struct vcpu *v, unsigned long value);
>  void vmx_nest_update_secondary_exec_control(struct vcpu *v,
>                                              unsigned long value);
> @@ -81,4 +86,6 @@
>  int vmx_nest_msr_write_intercept(struct cpu_user_regs *regs,
>                                   u64 msr_content);
> 
> +int vmx_nest_vept(struct vcpu *v);
> +
>  #endif /* __ASM_X86_HVM_NEST_H__ */
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/include/asm-x86/hvm/vmx/vept.h
> --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/include/asm-x86/hvm/vmx/vept.h        Thu Apr 22 22:30:10 2010 +0800
> @@ -0,0 +1,10 @@
> +#include <asm/hvm/vmx/vmx.h>
> +
> +
> +struct vept *vept_init(struct vcpu *v);
> +void vept_teardown(struct vept *vept);
> +mfn_t vept_load_eptp(struct vept *vept, u64 eptp);
> +mfn_t vept_invalidate(struct vept *vept, u64 eptp);
> +void vept_invalidate_all(struct vept *vept);
> +int vept_ept_violation(struct vept *vept, u64 eptp,
> +                       unsigned long qualification, paddr_t addr);
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxxxxxxxx
> http://lists.xensource.com/xen-devel

-- 
Tim Deegan <Tim.Deegan@xxxxxxxxxx>
Principal Software Engineer, XenServer Engineering
Citrix Systems UK Ltd.  (Company #02937203, SL9 0BG)

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 

