
Re: [Xen-devel] [PATCH 15/17] vmx: nest: virtual ept for nested



At 10:41 +0100 on 22 Apr (1271932887), Qing He wrote:
> This patch adds virtual ept capability to L1.
> It's implemented as a simple per vCPU vTLB like component
> independent to domain wide p2m.
> 
> Signed-off-by: Qing He <qing.he@xxxxxxxxx>

> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/nest.c
> --- a/xen/arch/x86/hvm/vmx/nest.c       Thu Apr 22 22:30:09 2010 +0800
> +++ b/xen/arch/x86/hvm/vmx/nest.c       Thu Apr 22 22:30:10 2010 +0800
> @@ -26,6 +26,7 @@
>  #include <asm/hvm/vmx/vmx.h>
>  #include <asm/hvm/vmx/vvmcs.h>
>  #include <asm/hvm/vmx/nest.h>
> +#include <asm/hvm/vmx/vept.h>
> 
>  /*
>   * VMX instructions support functions
> @@ -295,6 +296,9 @@
>      __vmptrld(virt_to_maddr(nest->hvmcs));
>      v->arch.hvm_vmx.launched = 0;
> 
> +    nest->geptp = 0;
> +    nest->vept = vept_init(v);
> +
>      vmreturn(regs, VMSUCCEED);
> 
>  out:
> @@ -313,6 +317,9 @@
>      if ( unlikely(!nest->guest_vmxon_pa) )
>          goto invalid_op;
> 
> +    vept_teardown(nest->vept);
> +    nest->vept = 0;
> +
>      nest->guest_vmxon_pa = 0;
>      __vmpclear(virt_to_maddr(nest->svmcs));
> 
> @@ -529,6 +536,67 @@
>      return vmx_nest_handle_vmresume(regs);
>  }
> 
> +int vmx_nest_handle_invept(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct vmx_inst_decoded decode;
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    mfn_t mfn;
> +    u64 eptp;
> +    int type;
> +
> +    if ( unlikely(!nest->guest_vmxon_pa) )
> +        goto invalid_op;
> +
> +    decode_vmx_inst(regs, &decode);
> +
> +    hvm_copy_from_guest_virt(&eptp, decode.mem, sizeof(eptp), 0);
> +    type = reg_read(regs, decode.reg2);

Needs error handling like the other new instructions. 
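Something along these lines, perhaps (untested sketch -- it assumes
decode_vmx_inst() reports failure with the X86EMUL_* codes and injects the
fault itself, like the other handlers in this series, and that a
VMFAIL_INVALID vmreturn is the right response to a bad operand read):

    if ( decode_vmx_inst(regs, &decode) != X86EMUL_OKAY )
        return X86EMUL_EXCEPTION;

    if ( hvm_copy_from_guest_virt(&eptp, decode.mem, sizeof(eptp), 0)
         != HVMCOPY_okay )
    {
        vmreturn(regs, VMFAIL_INVALID);
        return X86EMUL_OKAY;
    }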

> +    /* TODO: physical invept on other cpus */

?

> +    switch ( type )
> +    {
> +    case 1:
> +        mfn = vept_invalidate(nest->vept, eptp);
> +        if ( eptp == nest->geptp )
> +            nest->geptp = 0;
> +
> +        if ( __mfn_valid(mfn_x(mfn)) )
> +            __invept(1, mfn_x(mfn) << PAGE_SHIFT | (eptp & 0xfff), 0);
> +        break;
> +    case 2:
> +        vept_invalidate_all(nest->vept);
> +        nest->geptp = 0;
> +        break;
> +    default:
> +        gdprintk(XENLOG_ERR, "nest: unsupported invept type %d\n", type);
> +        break;
> +    }
> +
> +    vmreturn(regs, VMSUCCEED);
> +
> +    return X86EMUL_OKAY;
> +
> +invalid_op:
> +    hvm_inject_exception(TRAP_invalid_op, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +}
> +
> +int vmx_nest_vept(struct vcpu *v)
> +{
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    int r = 0;
> +
> +    if ( paging_mode_hap(v->domain) &&
> +         (__get_vvmcs(nest->vvmcs, CPU_BASED_VM_EXEC_CONTROL) &
> +          CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
> +         (__get_vvmcs(nest->vvmcs, SECONDARY_VM_EXEC_CONTROL) &
> +          SECONDARY_EXEC_ENABLE_EPT) )
> +        r = 1;
> +
> +    return r;
> +}
> +
>  /*
>   * Nested VMX context switch
>   */
> @@ -739,7 +807,14 @@
>      vvmcs_to_shadow(nest->vvmcs, CR0_GUEST_HOST_MASK);
>      vvmcs_to_shadow(nest->vvmcs, CR4_GUEST_HOST_MASK);
> 
> -    /* TODO: PDPTRs for nested ept */
> +    if ( vmx_nest_vept(v) )
> +    {
> +        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR0);
> +        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR1);
> +        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR2);
> +        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR3);
> +    }
> +
>      /* TODO: CR3 target control */
>  }
> 
> @@ -787,14 +862,32 @@
>      }
>  #endif
> 
> +
> +    /* loading EPT_POINTER for L2 */
> +    if ( vmx_nest_vept(v) )
> +    {
> +        u64 geptp;
> +        mfn_t mfn;
> +
> +        geptp = __get_vvmcs(nest->vvmcs, EPT_POINTER);
> +        if ( geptp != nest->geptp )
> +        {
> +            mfn = vept_load_eptp(nest->vept, geptp);

What if vept_load_eptp() returns INVALID_MFN?
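At a minimum I'd expect something like this (sketch only; crashing the guest
is just one way to bail out of the entry path, and __mfn_valid() matches how
the invept handler above checks its mfn):

            mfn = vept_load_eptp(nest->vept, geptp);
            if ( !__mfn_valid(mfn_x(mfn)) )
            {
                gdprintk(XENLOG_ERR,
                         "nest: no shadow EPT root for eptp %"PRIx64"\n", geptp);
                domain_crash(v->domain);  /* don't load a junk EPT_POINTER */
                return;
            }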

> +            nest->geptp = geptp;
> +
> +            __vmwrite(EPT_POINTER, (mfn_x(mfn) << PAGE_SHIFT) | 0x1e);
> +#ifdef __i386__
> +            __vmwrite(EPT_POINTER_HIGH, (mfn_x(mfn) << PAGE_SHIFT) >> 32);
> +#endif
> +        }
> +    }
> +
>      regs->rip = __get_vvmcs(nest->vvmcs, GUEST_RIP);
>      regs->rsp = __get_vvmcs(nest->vvmcs, GUEST_RSP);
>      regs->rflags = __get_vvmcs(nest->vvmcs, GUEST_RFLAGS);
> 
>      /* updating host cr0 to sync TS bit */
>      __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
> -
> -    /* TODO: EPT_POINTER */
>  }
> 
>  static void sync_vvmcs_guest_state(struct vmx_nest_struct *nest)
> @@ -1064,8 +1157,26 @@
>          break;
>      }
> 
> +    case EXIT_REASON_EPT_VIOLATION:
> +    {
> +        unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> +        paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
> +#ifdef __i386__
> +        gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32;
> +#endif
> +        if ( vmx_nest_vept(v) )
> +        {
> +            if ( !vept_ept_violation(nest->vept, nest->geptp,
> +                     exit_qualification, gpa) )
> +                bypass_l0 = 1;
> +            else
> +                nest->vmexit_pending = 1;

Since bypass_l0 is set from vmexit_pending() here, it looks like it's
always going to be set.  Does that mean we never handle a real EPT
violation at L0?  I would expect there to be three possible outcomes
here: give the violation to L1, give it to L0, or fix it in the vept and
discard it.
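i.e. something shaped more like this, with vept_ept_violation() telling the
three cases apart (the VEPT_* names are invented here, just to illustrate):

        if ( vmx_nest_vept(v) )
        {
            switch ( vept_ept_violation(nest->vept, nest->geptp,
                                        exit_qualification, gpa) )
            {
            case VEPT_FIXED:        /* fixed up in the shadow table, discard */
                bypass_l0 = 1;
                break;
            case VEPT_INJECT_L1:    /* not mapped by L1's tables */
                nest->vmexit_pending = 1;
                break;
            case VEPT_HANDLE_L0:    /* L1 maps it; let L0's handler run */
            default:
                break;
            }
        }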

> +        }
> +
> +        break;
> +    }
> +
>      case EXIT_REASON_WBINVD:
> -    case EXIT_REASON_EPT_VIOLATION:
>      case EXIT_REASON_EPT_MISCONFIG:
>      case EXIT_REASON_EXTERNAL_INTERRUPT:
>          /* pass to L0 handler */
> @@ -1229,11 +1340,14 @@
>          data = (data << 32) | eax;
>          break;
>      case MSR_IA32_VMX_PROCBASED_CTLS:
> +        mask = paging_mode_hap(current->domain)?
> +                   0: CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> +
>          rdmsr(regs->ecx, eax, edx);
>  #define REMOVED_EXEC_CONTROL_CAP (CPU_BASED_TPR_SHADOW \
> -            | CPU_BASED_ACTIVATE_MSR_BITMAP            \
> -            | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
> +            | CPU_BASED_ACTIVATE_MSR_BITMAP)
>          data = edx & ~REMOVED_EXEC_CONTROL_CAP;
> +        data = edx & ~mask;
>          data = (data << 32) | eax;
>          break;
>      case MSR_IA32_VMX_EXIT_CTLS:
> @@ -1254,12 +1368,20 @@
>          data = (data << 32) | eax;
>          break;
>      case MSR_IA32_VMX_PROCBASED_CTLS2:
> -        mask = 0;
> +        mask = paging_mode_hap(current->domain)?
> +                   SECONDARY_EXEC_ENABLE_EPT : 0;
> 
>          rdmsr(regs->ecx, eax, edx);
>          data = edx & mask;
>          data = (data << 32) | eax;
>          break;
> +    case MSR_IA32_VMX_EPT_VPID_CAP:
> +        rdmsr(regs->ecx, eax, edx);
> +#define REMOVED_EPT_VPID_CAP_HIGH   ( 1 | 1<<8 | 1<<9 | 1<<10 | 1<<11 )
> +#define REMOVED_EPT_VPID_CAP_LOW    ( 1<<16 | 1<<17 | 1<<26 )
> +        data = edx & ~REMOVED_EPT_VPID_CAP_HIGH;
> +        data = (data << 32) | (eax & ~REMOVED_EPT_VPID_CAP_LOW);
> +        break;
> 
>      /* pass through MSRs */
>      case IA32_FEATURE_CONTROL_MSR:
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/vept.c
> --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/arch/x86/hvm/vmx/vept.c       Thu Apr 22 22:30:10 2010 +0800
> @@ -0,0 +1,574 @@
> +/*
> + * vept.c: virtual EPT for nested virtualization
> + *
> + * Copyright (c) 2010, Intel Corporation.
> + * Author: Qing He <qing.he@xxxxxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.
> + *
> + */
> +
> +#include <xen/config.h>
> +#include <xen/types.h>
> +#include <xen/list.h>
> +#include <xen/mm.h>
> +#include <xen/paging.h>
> +#include <xen/domain_page.h>
> +#include <xen/sched.h>
> +#include <asm/page.h>
> +#include <xen/numa.h>
> +#include <asm/hvm/vmx/vmx.h>
> +#include <asm/hvm/vmx/vept.h>
> +
> +#undef mfn_to_page
> +#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
> +#undef mfn_valid
> +#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
> +#undef page_to_mfn
> +#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
> +
> +/*
> + * This virtual EPT implementation is independent to p2m facility
> + * and has some different characteristics. It works in a similar
> + * way as shadow page table (guest table and host table composition),
> + * but is per-vcpu, and of vTLB style
> + *   - per vCPU so no lock is required

What happens when dom0 changes domU's p2m table?  Don't you need to
shoot down existing vEPT tables from a foreign CPU?
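For instance, the p2m update paths could kick every vcpu's vept along these
lines -- vept_flush_domain() and the vept_flush_pending flag are hypothetical,
just to show the shape of it:

    /* called from the p2m write/teardown path: */
    void vept_flush_domain(struct domain *d)
    {
        struct vcpu *v;

        for_each_vcpu ( d, v )
            v->arch.hvm_vmx.nest.vept_flush_pending = 1;
    }

    /* ...and checked by the owning vcpu before its next virtual vmentry: */
    if ( nest->vept_flush_pending )
    {
        vept_invalidate_all(nest->vept);
        nest->geptp = 0;
        nest->vept_flush_pending = 0;
    }

plus whatever IPI or ordering is needed so a vcpu already running L2 notices
before it touches the stale shadow tables again.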

> + *   - vTLB style signifies honoring all invalidations, and not
> + * write protection. Unlike ordinary page table, since EPT updates
> + * and invalidations are minimal in a well written VMM, overhead
> + * is also minimized.
> + *
> + * The physical root is loaded directly to L2 sVMCS, without entering
> + * any other host controls. Multiple `cache slots' are maintained
> + * for multiple guest EPTPs, with simple LRU replacement.
> + *
> + * One of the limitations so far, is that it doesn't work with
> + * L0 emulation code, so L1 p2m_mmio_direct on top of L0 p2m_mmio_dm
> + * is not supported as for now.

Is this something you intend to fix before we check it in?

> + */
> +
> +#define VEPT_MAX_SLOTS 8
> +#define VEPT_ALLOCATION_SIZE 512
> +
> +struct vept_slot {
> +    u64               eptp;   /* guest eptp */
> +    mfn_t             root;   /* root of phys table */
> +    struct list_head  list;
> +
> +    struct page_list_head page_list;
> +};
> +
> +struct vept {
> +    struct list_head   used_slots; /* lru: new->tail, old->head */
> +    struct list_head   free_slots;
> +
> +    int                total_pages;
> +    int                free_pages;
> +    struct page_list_head freelist;
> +
> +    struct vcpu       *vcpu;
> +};
> +
> +
> +static struct vept_slot *__get_eptp_slot(struct vept *vept, u64 geptp)
> +{
> +    struct vept_slot *slot, *tmp;
> +
> +    list_for_each_entry_safe( slot, tmp, &vept->used_slots, list )
> +        if ( slot->eptp == geptp )
> +            return slot;
> +
> +    return NULL;
> +}
> +
> +static struct vept_slot *get_eptp_slot(struct vept *vept, u64 geptp)
> +{
> +    struct vept_slot *slot;
> +
> +    slot = __get_eptp_slot(vept, geptp);
> +    if ( slot != NULL )
> +        list_del(&slot->list);
> +
> +    return slot;
> +}
> +
> +static void __clear_slot(struct vept *vept, struct vept_slot *slot)
> +{
> +    struct page_info *pg;
> +
> +    slot->eptp = 0;
> +
> +    while ( !page_list_empty(&slot->page_list) )
> +    {
> +        pg = page_list_remove_head(&slot->page_list);
> +        page_list_add_tail(pg, &vept->freelist);
> +
> +        vept->free_pages++;
> +    }
> +}
> +
> +static struct vept_slot *get_free_slot(struct vept *vept)
> +{
> +    struct vept_slot *slot = NULL;
> +
> +    if ( !list_empty(&vept->free_slots) )
> +    {
> +        slot = list_entry(vept->free_slots.next, struct vept_slot, list);
> +        list_del(&slot->list);
> +    }
> +    else if ( !list_empty(&vept->used_slots) )
> +    {
> +        slot = list_entry(vept->used_slots.next, struct vept_slot, list);
> +        list_del(&slot->list);
> +        __clear_slot(vept, slot);
> +    }
> +
> +    return slot;
> +}
> +
> +static void clear_all_slots(struct vept *vept)
> +{
> +    struct vept_slot *slot, *tmp;
> +
> +    list_for_each_entry_safe( slot, tmp, &vept->used_slots, list )
> +    {
> +        list_del(&slot->list);
> +        __clear_slot(vept, slot);
> +        list_add_tail(&slot->list, &vept->free_slots);
> +    }
> +}
> +
> +static int free_some_pages(struct vept *vept, struct vept_slot *curr)
> +{
> +    struct vept_slot *slot;
> +    int r = 0;
> +
> +    if ( !list_empty(&vept->used_slots) )
> +    {
> +        slot = list_entry(vept->used_slots.next, struct vept_slot, list);
> +        if ( slot != curr )
> +        {
> +            list_del(&slot->list);
> +            __clear_slot(vept, slot);
> +            list_add_tail(&slot->list, &vept->free_slots);
> +
> +            r = 1;
> +        }
> +    }
> +
> +    return r;
> +}
> +
> +struct vept *vept_init(struct vcpu *v)
> +{
> +    struct vept *vept;
> +    struct vept_slot *slot;
> +    struct page_info *pg;
> +    int i;
> +
> +    vept = xmalloc(struct vept);
> +    if ( vept == NULL )
> +        goto out;
> +
> +    memset(vept, 0, sizeof(*vept));
> +    vept->vcpu = v;
> +
> +    INIT_PAGE_LIST_HEAD(&vept->freelist);
> +    INIT_LIST_HEAD(&vept->used_slots);
> +    INIT_LIST_HEAD(&vept->free_slots);
> +
> +    for ( i = 0; i < VEPT_MAX_SLOTS; i++ )
> +    {
> +        slot = xmalloc(struct vept_slot);
> +        if ( slot == NULL )
> +            break;
> +
> +        memset(slot, 0, sizeof(*slot));
> +
> +        INIT_LIST_HEAD(&slot->list);
> +        INIT_PAGE_LIST_HEAD(&slot->page_list);
> +
> +        list_add(&slot->list, &vept->free_slots);
> +    }
> +
> +    for ( i = 0; i < VEPT_ALLOCATION_SIZE; i++ )

Why a fixed 2MB allocation?  What if your nested domains are very large?

> +    {
> +        pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(v->domain)));

Shouldn't this be allocated from the paging pool like other EPT memory?

> +        if ( pg == NULL )
> +            break;

Return an error?
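I'd have thought so -- free whatever has been allocated and return NULL,
letting the VMXON handler fail cleanly (sketch, assuming vept_teardown()
copes with a partially-built vept):

        if ( pg == NULL )
        {
            vept_teardown(vept);
            return NULL;
        }

and in vmx_nest_handle_vmxon():

    nest->vept = vept_init(v);
    if ( nest->vept == NULL )
        goto out;    /* or fail the VMXON with VMFAIL_INVALID */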

> +        page_list_add_tail(pg, &vept->freelist);
> +        vept->total_pages++;
> +        vept->free_pages++;
> +    }
> +
> + out:
> +    return vept;
> +}
> +
> +void vept_teardown(struct vept *vept)
> +{
> +    struct page_info *pg;
> +    struct vept_slot *slot, *tmp;
> +
> +    clear_all_slots(vept);
> +
> +    while ( !page_list_empty(&vept->freelist) )
> +    {
> +        pg = page_list_remove_head(&vept->freelist);
> +        free_domheap_page(pg);
> +        vept->free_pages++;
> +        vept->total_pages++;
> +    }
> +
> +    list_for_each_entry_safe( slot, tmp, &vept->free_slots, list )
> +        xfree(slot);
> +
> +    xfree(vept);
> +}
> +
> +mfn_t vept_load_eptp(struct vept *vept, u64 geptp)
> +{
> +    struct page_info *pg;
> +    struct vept_slot *slot;
> +    mfn_t mfn = _mfn(INVALID_MFN);
> +    void *addr;
> +
> +    ASSERT(vept->vcpu == current);
> +
> +    slot = get_eptp_slot(vept, geptp);
> +    if ( slot == NULL )
> +    {
> +        slot = get_free_slot(vept);
> +        if ( unlikely(slot == NULL) )
> +        {
> +            gdprintk(XENLOG_ERR, "nest: can't get free slot\n");
> +            return mfn;
> +        }
> +
> +        while ( !vept->free_pages )
> +            if ( !free_some_pages(vept, slot) )
> +            {
> +                slot->eptp = 0;
> +                list_add_tail(&slot->list, &vept->free_slots);
> +                gdprintk(XENLOG_ERR, "nest: vept no free pages\n");
> +
> +                return mfn;
> +            }
> +
> +        vept->free_pages--;
> +        pg = page_list_remove_head(&vept->freelist);
> +
> +        mfn = page_to_mfn(pg);
> +        addr = map_domain_page(mfn_x(mfn));
> +        clear_page(addr);
> +        unmap_domain_page(addr);
> +        page_list_add_tail(pg, &slot->page_list);
> +        slot->eptp = geptp;
> +        slot->root = mfn;
> +    }
> +
> +    mfn = slot->root;
> +    list_add_tail(&slot->list, &vept->used_slots);
> +
> +    return mfn;
> +}
> +
> +mfn_t vept_invalidate(struct vept *vept, u64 geptp)
> +{
> +    struct vept_slot *slot;
> +    mfn_t mfn = _mfn(INVALID_MFN);
> +
> +    ASSERT(vept->vcpu == current);
> +
> +    slot = get_eptp_slot(vept, geptp);
> +    if ( slot != NULL )
> +    {
> +        mfn = slot->root;
> +        __clear_slot(vept, slot);
> +        list_add_tail(&slot->list, &vept->free_slots);
> +    }
> +
> +    return mfn;
> +}
> +
> +void vept_invalidate_all(struct vept *vept)
> +{
> +    ASSERT(vept->vcpu == current);
> +
> +    clear_all_slots(vept);
> +}
> +
> +/*
> + * guest EPT walk and EPT violation
> + */
> +struct ept_walk {
> +    unsigned long gfn;
> +    unsigned long gfn_remainder;
> +    ept_entry_t l4e, l3e, l2e, l1e;
> +    mfn_t l4mfn, l3mfn, l2mfn, l1mfn;
> +    int sp;
> +};
> +typedef struct ept_walk ept_walk_t;
> +
> +#define GEPT_NORMAL_PAGE  0
> +#define GEPT_SUPER_PAGE   1
> +#define GEPT_NOT_PRESENT  2
> +static int guest_ept_next_level(struct vcpu *v, ept_entry_t **table,
> +               unsigned long *gfn_remainder, int level, u32 *ar,
> +               ept_entry_t *entry, mfn_t *next_mfn)
> +{
> +    int index;
> +    ept_entry_t *ept_entry;
> +    ept_entry_t *next;
> +    p2m_type_t p2mt;
> +    int rc = GEPT_NORMAL_PAGE;
> +    mfn_t mfn;
> +
> +    index = *gfn_remainder >> (level * EPT_TABLE_ORDER);
> +
> +    ept_entry = (*table) + index;
> +    *entry = *ept_entry;
> +    *ar &= entry->epte & 0x7;
> +
> +    *gfn_remainder &= (1UL << (level * EPT_TABLE_ORDER)) - 1;
> +
> +    if ( !(ept_entry->epte & 0x7) )
> +        rc = GEPT_NOT_PRESENT;
> +    else if ( ept_entry->sp_avail )
> +        rc = GEPT_SUPER_PAGE;
> +    else
> +    {
> +        mfn = gfn_to_mfn(v->domain, ept_entry->mfn, &p2mt);
> +        if ( !p2m_is_ram(p2mt) )
> +            return GEPT_NOT_PRESENT;
> +
> +        if ( next_mfn )
> +        {
> +            next = map_domain_page(mfn_x(mfn));
> +            unmap_domain_page(*table);
> +
> +            *table = next;
> +            *next_mfn = mfn;
> +        }
> +    }
> +
> +    return rc;
> +}
> +
> +static u32 guest_walk_ept(struct vcpu *v, ept_walk_t *gw,
> +                          u64 geptp, u64 ggpa)
> +{
> +    ept_entry_t *table;
> +    p2m_type_t p2mt;
> +    int rc;
> +    u32 ar = 0x7;
> +
> +    unsigned long gfn = (unsigned long) (ggpa >> PAGE_SHIFT);
> +    unsigned long gfn_remainder = gfn;
> +
> +    memset(gw, 0, sizeof(*gw));
> +    gw->gfn = gfn;
> +    gw->sp = 0;
> +
> +    gw->l4mfn = gfn_to_mfn(v->domain, geptp >> PAGE_SHIFT, &p2mt);
> +    if ( !p2m_is_ram(p2mt) )
> +        return 0;
> +
> +    table = map_domain_page(mfn_x(gw->l4mfn));
> +
> +    rc = guest_ept_next_level(v, &table, &gfn_remainder, 3, &ar,
> +                              &gw->l4e, &gw->l3mfn);
> +
> +    if ( rc )
> +        goto out;
> +
> +    rc = guest_ept_next_level(v, &table, &gfn_remainder, 2, &ar,
> +                              &gw->l3e, &gw->l2mfn);
> +
> +    if ( rc == GEPT_SUPER_PAGE )
> +        gw->sp = 2;
> +    if ( rc )
> +        goto out;
> +
> +    rc = guest_ept_next_level(v, &table, &gfn_remainder, 1, &ar,
> +                              &gw->l2e, &gw->l1mfn);
> +
> +    if ( rc == GEPT_SUPER_PAGE )
> +        gw->sp = 1;
> +    if ( rc )
> +        goto out;
> +
> +    rc = guest_ept_next_level(v, &table, &gfn_remainder, 0, &ar,
> +                              &gw->l1e, NULL);
> +
> + out:
> +    gw->gfn_remainder = gfn_remainder;
> +    unmap_domain_page(*table);
> +    return ar;
> +}
> +
> +static void epte_set_ar_bits(ept_entry_t *entry, unsigned long ar)
> +{
> +    entry->epte &= ~0x7f;
> +    entry->epte |= ar & 0x7f;
> +}
> +
> +static int shadow_ept_next_level(struct vept *vept, struct vept_slot *slot,
> +                       ept_entry_t **table, unsigned long *gfn_remainder,
> +                       int level, u32 *ar, ept_entry_t gentry)
> +{
> +    int index;
> +    ept_entry_t *sentry;
> +    ept_entry_t *next;
> +    mfn_t mfn;
> +    struct page_info *pg;
> +
> +    index = *gfn_remainder >> (level * EPT_TABLE_ORDER);
> +
> +    sentry = (*table) + index;
> +    *ar = sentry->epte & 0x7;
> +
> +    *gfn_remainder &= (1UL << (level * EPT_TABLE_ORDER)) - 1;
> +
> +    if ( !(sentry->epte & 0x7) )
> +    {
> +        while ( !vept->free_pages )
> +            if ( !free_some_pages(vept, slot) )
> +            {
> +                gdprintk(XENLOG_ERR, "nest: vept no free pages\n");
> +                return 0;
> +            }
> +
> +        vept->free_pages--;
> +        pg = page_list_remove_head(&vept->freelist);
> +        page_list_add_tail(pg, &slot->page_list);
> +        mfn = page_to_mfn(pg);
> +        next = map_domain_page(mfn_x(mfn));
> +        clear_page(next);
> +
> +        sentry->mfn = mfn_x(mfn);
> +    }
> +    else
> +    {
> +        next = map_domain_page(sentry->mfn);
> +    }
> +
> +    epte_set_ar_bits(sentry, gentry.epte);
> +
> +    unmap_domain_page(*table);
> +    *table = next;
> +
> +    return 1;
> +}
> +
> +int vept_ept_violation(struct vept *vept, u64 geptp,
> +                       unsigned long qualification, paddr_t addr)
> +{
> +    ept_walk_t gw;
> +    struct vept_slot *slot;
> +    ept_entry_t *table, *gept;
> +    ept_entry_t *sentry, *gentry;
> +    u32 old_entry, sp_ar = 0;
> +    p2m_type_t p2mt;
> +    unsigned long mfn_start = 0;
> +    unsigned long gfn_remainder;
> +    int rc, i;
> +
> +    ASSERT(vept->vcpu == current);
> +
> +    slot = __get_eptp_slot(vept, geptp);
> +    if ( unlikely(slot == NULL) )
> +        return 0;
> +
> +    rc = guest_walk_ept(vept->vcpu, &gw, geptp, addr);
> +
> +    if ( !(rc & (qualification & 0x7)) )    /* inject to guest */
> +        return 1;
> +
> +    if ( gw.sp == 2 )  /* 1G */
> +    {
> +        sp_ar = gw.l3e.epte & 0x7;
> +        mfn_start = gw.l3e.mfn +
> +                    (gw.gfn_remainder & (~(1 << EPT_TABLE_ORDER) - 1));
> +    }
> +    if ( gw.sp == 1 )  /* 2M */
> +    {
> +        sp_ar = gw.l2e.epte & 0x7;
> +        mfn_start = gw.l2e.mfn;
> +    }
> +    else
> +        mfn_start = 0;
> +
> +    table = map_domain_page(mfn_x(slot->root));
> +    gfn_remainder = gw.gfn;
> +
> +    shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 3,
> +                          &old_entry, gw.l4e);

What if shadow_ept_next_level() returns 0 ?

> +    shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 2,
> +                          &old_entry, gw.l3e);

Ditto

> +    shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 1,
> +                          &old_entry, (gw.sp == 2) ? gw.l3e : gw.l2e);

Ditto
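All three calls want their return values checked, so that we stop rather than
keep writing through a table pointer that was never advanced, e.g.:

    if ( !shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 3,
                                &old_entry, gw.l4e) ||
         !shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 2,
                                &old_entry, gw.l3e) ||
         !shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 1,
                                &old_entry, (gw.sp == 2) ? gw.l3e : gw.l2e) )
    {
        unmap_domain_page(table);
        return 0;    /* or whichever of the outcomes above is right here */
    }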

> +    /* if l1p is just allocated, do a full prefetch */
> +    if ( !old_entry && !gw.sp )
> +    {
> +        gept = map_domain_page(mfn_x(gw.l1mfn));
> +        for ( i = 0; i < 512; i++ )
> +        {
> +            gentry = gept + i;
> +            sentry = table + i;
> +            if ( gentry->epte & 0x7 )
> +            {
> +                sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> +                                        gentry->mfn, &p2mt));
> +                epte_set_ar_bits(sentry, gentry->epte);
> +            }
> +            else
> +                sentry->epte = 0;
> +        }
> +        unmap_domain_page(gept);
> +    }
> +    else if ( !old_entry && gw.sp )
> +    {
> +        for ( i = 0; i < 512; i++ )
> +        {
> +            sentry = table + i;
> +            sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> +                                    mfn_start + i, &p2mt));
> +            epte_set_ar_bits(sentry, sp_ar);
> +        }
> +    }
> +    else if ( old_entry && !gw.sp )
> +    {
> +        i = gw.gfn & ((1 << EPT_TABLE_ORDER) - 1);
> +        sentry = table + i;
> +        sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> +                                gw.l1e.mfn, &p2mt));
> +        epte_set_ar_bits(sentry, gw.l1e.epte);
> +    }
> +    else    // old_entry && gw.sp
> +    {
> +        i = gw.gfn & ((1 << EPT_TABLE_ORDER) - 1);
> +        sentry = table + i;
> +        sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> +                                mfn_start + i, &p2mt));
> +        epte_set_ar_bits(sentry, sp_ar);
> +    }
> +
> +    unmap_domain_page(table);
> +    return 0;
> +}
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/vmx.c
> --- a/xen/arch/x86/hvm/vmx/vmx.c        Thu Apr 22 22:30:09 2010 +0800
> +++ b/xen/arch/x86/hvm/vmx/vmx.c        Thu Apr 22 22:30:10 2010 +0800
> @@ -1032,6 +1032,14 @@
>      p2m_type_t p2mt;
>      char *p;
> 
> +    /*
> +     * If in nesting EPT operation, L0 doesn't have the knowledge on
> +     * how to interpret CR3, it's L1's responsibility to provide
> +     * GUEST_PDPTRn, we rely solely on them.
> +     */
> +    if ( v->arch.hvm_vcpu.in_nesting && vmx_nest_vept(v) )
> +        return;
> +
>      /* EPT needs to load PDPTRS into VMCS for PAE. */
>      if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
>          return;
> @@ -2705,6 +2713,11 @@
>          if ( vmx_nest_handle_vmxon(regs) == X86EMUL_OKAY )
>              __update_guest_eip(inst_len);
>          break;
> +    case EXIT_REASON_INVEPT:
> +        inst_len = __get_instruction_length();
> +        if ( vmx_nest_handle_invept(regs) == X86EMUL_OKAY )
> +            __update_guest_eip(inst_len);
> +        break;
> 
>      case EXIT_REASON_MWAIT_INSTRUCTION:
>      case EXIT_REASON_MONITOR_INSTRUCTION:
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/include/asm-x86/hvm/vmx/nest.h
> --- a/xen/include/asm-x86/hvm/vmx/nest.h        Thu Apr 22 22:30:09 2010 +0800
> +++ b/xen/include/asm-x86/hvm/vmx/nest.h        Thu Apr 22 22:30:10 2010 +0800
> @@ -47,6 +47,9 @@
> 
>      unsigned long        intr_info;
>      unsigned long        error_code;
> +
> +    u64                  geptp;
> +    struct vept         *vept;
>  };
> 
>  asmlinkage void vmx_nest_switch_mode(void);
> @@ -64,6 +67,8 @@
>  int vmx_nest_handle_vmresume(struct cpu_user_regs *regs);
>  int vmx_nest_handle_vmlaunch(struct cpu_user_regs *regs);
> 
> +int vmx_nest_handle_invept(struct cpu_user_regs *regs);
> +
>  void vmx_nest_update_exec_control(struct vcpu *v, unsigned long value);
>  void vmx_nest_update_secondary_exec_control(struct vcpu *v,
>                                              unsigned long value);
> @@ -81,4 +86,6 @@
>  int vmx_nest_msr_write_intercept(struct cpu_user_regs *regs,
>                                   u64 msr_content);
> 
> +int vmx_nest_vept(struct vcpu *v);
> +
>  #endif /* __ASM_X86_HVM_NEST_H__ */
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/include/asm-x86/hvm/vmx/vept.h
> --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/include/asm-x86/hvm/vmx/vept.h        Thu Apr 22 22:30:10 2010 +0800
> @@ -0,0 +1,10 @@
> +#include <asm/hvm/vmx/vmx.h>
> +
> +
> +struct vept *vept_init(struct vcpu *v);
> +void vept_teardown(struct vept *vept);
> +mfn_t vept_load_eptp(struct vept *vept, u64 eptp);
> +mfn_t vept_invalidate(struct vept *vept, u64 eptp);
> +void vept_invalidate_all(struct vept *vept);
> +int vept_ept_violation(struct vept *vept, u64 eptp,
> +                       unsigned long qualification, paddr_t addr);
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxxxxxxxx
> http://lists.xensource.com/xen-devel

-- 
Tim Deegan <Tim.Deegan@xxxxxxxxxx>
Principal Software Engineer, XenServer Engineering
Citrix Systems UK Ltd.  (Company #02937203, SL9 0BG)

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 

