[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [V10 PATCH 19/23] PVH xen: vmcs related changes



On Wed, Jul 24, 2013 at 2:59 AM, Mukesh Rathor <mukesh.rathor@xxxxxxxxxx> wrote:
> This patch contains vmcs changes related for PVH, mainly creating a VMCS
> for PVH guest.
>
> Signed-off-by: Mukesh Rathor <mukesh.rathor@xxxxxxxxxx>
> ---
>  xen/arch/x86/hvm/vmx/vmcs.c |  247 
> ++++++++++++++++++++++++++++++++++++++++++-
>  1 files changed, 245 insertions(+), 2 deletions(-)
>
> diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
> index 36f167f..8d35370 100644
> --- a/xen/arch/x86/hvm/vmx/vmcs.c
> +++ b/xen/arch/x86/hvm/vmx/vmcs.c
> @@ -634,7 +634,7 @@ void vmx_vmcs_exit(struct vcpu *v)
>      {
>          /* Don't confuse vmx_do_resume (for @v or @current!) */
>          vmx_clear_vmcs(v);
> -        if ( is_hvm_vcpu(current) )
> +        if ( !is_pv_vcpu(current) )
>              vmx_load_vmcs(current);
>
>          spin_unlock(&v->arch.hvm_vmx.vmcs_lock);
> @@ -856,6 +856,239 @@ static void vmx_set_common_host_vmcs_fields(struct vcpu 
> *v)
>      __vmwrite(HOST_SYSENTER_EIP, sysenter_eip);
>  }
>
> +static int pvh_check_requirements(struct vcpu *v)
> +{
> +    u64 required, tmpval = real_cr4_to_pv_guest_cr4(mmu_cr4_features);
> +
> +    if ( !paging_mode_hap(v->domain) )
> +    {
> +        printk(XENLOG_G_INFO "HAP is required for PVH guest.\n");
> +        return -EINVAL;
> +    }
> +    if ( !cpu_has_vmx_pat )
> +    {
> +        printk(XENLOG_G_INFO "PVH: CPU does not have PAT support\n");
> +        return -ENOSYS;
> +    }
> +    if ( !cpu_has_vmx_msr_bitmap )
> +    {
> +        printk(XENLOG_G_INFO "PVH: CPU does not have msr bitmap\n");
> +        return -ENOSYS;
> +    }
> +    if ( !cpu_has_vmx_vpid )
> +    {
> +        printk(XENLOG_G_INFO "PVH: CPU doesn't have VPID support\n");
> +        return -ENOSYS;
> +    }
> +    if ( !cpu_has_vmx_secondary_exec_control )
> +    {
> +        printk(XENLOG_G_INFO "CPU Secondary exec is required to run PVH\n");
> +        return -ENOSYS;
> +    }
> +
> +    if ( v->domain->arch.vtsc )
> +    {
> +        printk(XENLOG_G_INFO
> +               "At present PVH only supports the default timer mode\n");
> +        return -ENOSYS;
> +    }
> +
> +    required = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSFXSR;
> +    if ( (tmpval & required) != required )
> +    {
> +        printk(XENLOG_G_INFO "PVH: required CR4 features not 
> available:%lx\n",
> +               required);
> +        return -ENOSYS;
> +    }
> +
> +    return 0;
> +}
> +
> +static int pvh_construct_vmcs(struct vcpu *v)
> +{
> +    int rc, msr_type;
> +    unsigned long *msr_bitmap;
> +    struct domain *d = v->domain;
> +    struct p2m_domain *p2m = p2m_get_hostp2m(d);
> +    struct ept_data *ept = &p2m->ept;
> +    u32 vmexit_ctl = vmx_vmexit_control;
> +    u32 vmentry_ctl = vmx_vmentry_control;
> +    u64 host_pat, tmpval = -1;
> +
> +    if ( (rc = pvh_check_requirements(v)) )
> +        return rc;
> +
> +    msr_bitmap = alloc_xenheap_page();
> +    if ( msr_bitmap == NULL )
> +        return -ENOMEM;
> +
> +    /* 1. Pin-Based Controls: */
> +    __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
> +
> +    v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
> +
> +    /* 2. Primary Processor-based controls: */
> +    /*
> +     * If rdtsc exiting is turned on and it goes thru emulate_privileged_op,
> +     * then pv_vcpu.ctrlreg must be added to the pvh struct.
> +     */
> +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING;
> +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING;
> +
> +    v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING |
> +                                      CPU_BASED_CR3_LOAD_EXITING |
> +                                      CPU_BASED_CR3_STORE_EXITING);
> +    v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
> +    v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_MSR_BITMAP;
> +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_TPR_SHADOW;
> +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
> +
> +    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
> +
> +    /* 3. Secondary Processor-based controls (Intel SDM: resvd bits are 0): 
> */
> +    v->arch.hvm_vmx.secondary_exec_control = SECONDARY_EXEC_ENABLE_EPT;
> +    v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_VPID;
> +    v->arch.hvm_vmx.secondary_exec_control |= 
> SECONDARY_EXEC_PAUSE_LOOP_EXITING;
> +
> +    __vmwrite(SECONDARY_VM_EXEC_CONTROL,
> +              v->arch.hvm_vmx.secondary_exec_control);
> +
> +    __vmwrite(IO_BITMAP_A, virt_to_maddr((char *)hvm_io_bitmap + 0));
> +    __vmwrite(IO_BITMAP_B, virt_to_maddr((char *)hvm_io_bitmap + PAGE_SIZE));
> +
> +    /* MSR bitmap for intercepts. */
> +    memset(msr_bitmap, ~0, PAGE_SIZE);
> +    v->arch.hvm_vmx.msr_bitmap = msr_bitmap;
> +    __vmwrite(MSR_BITMAP, virt_to_maddr(msr_bitmap));
> +
> +    msr_type = MSR_TYPE_R | MSR_TYPE_W;
> +    /* Disable interecepts for MSRs that have corresponding VMCS fields. */
> +    vmx_disable_intercept_for_msr(v, MSR_FS_BASE, msr_type);
> +    vmx_disable_intercept_for_msr(v, MSR_GS_BASE, msr_type);
> +    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, msr_type);
> +    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, msr_type);
> +    vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, msr_type);
> +    vmx_disable_intercept_for_msr(v, MSR_SHADOW_GS_BASE, msr_type);
> +    vmx_disable_intercept_for_msr(v, MSR_IA32_CR_PAT, msr_type);
> +
> +    /*
> +     * We don't disable intercepts for MSRs: MSR_STAR, MSR_LSTAR, MSR_CSTAR,
> +     * and MSR_SYSCALL_MASK because we need to specify save/restore area to
> +     * save/restore at every VM exit and entry. Instead, let the intercept
> +     * functions save them into vmx_msr_state fields. See comment in
> +     * vmx_restore_host_msrs(). See also vmx_restore_guest_msrs().
> +     */
> +    __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
> +    __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
> +    __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
> +
> +    __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl);
> +
> +    /*
> +     * Note: we run with default VM_ENTRY_LOAD_DEBUG_CTLS of 1, which means
> +     * upon vmentry, the cpu reads/loads VMCS.DR7 and VMCS.DEBUGCTLS, and not
> +     * use the host values. 0 would cause it to not use the VMCS values.
> +     */
> +    vmentry_ctl &= ~VM_ENTRY_LOAD_GUEST_EFER;
> +    vmentry_ctl &= ~VM_ENTRY_SMM;
> +    vmentry_ctl &= ~VM_ENTRY_DEACT_DUAL_MONITOR;
> +    /* PVH 32bitfixme. */
> +    vmentry_ctl |= VM_ENTRY_IA32E_MODE;       /* GUEST_EFER.LME/LMA ignored 
> */
> +
> +    __vmwrite(VM_ENTRY_CONTROLS, vmentry_ctl);
> +
> +    vmx_set_common_host_vmcs_fields(v);
> +
> +    __vmwrite(VM_ENTRY_INTR_INFO, 0);
> +    __vmwrite(CR3_TARGET_COUNT, 0);
> +    __vmwrite(GUEST_ACTIVITY_STATE, 0);
> +
> +    /* These are sorta irrelevant as we load the discriptors directly. */
> +    __vmwrite(GUEST_CS_SELECTOR, 0);
> +    __vmwrite(GUEST_DS_SELECTOR, 0);
> +    __vmwrite(GUEST_SS_SELECTOR, 0);
> +    __vmwrite(GUEST_ES_SELECTOR, 0);
> +    __vmwrite(GUEST_FS_SELECTOR, 0);
> +    __vmwrite(GUEST_GS_SELECTOR, 0);
> +
> +    __vmwrite(GUEST_CS_BASE, 0);
> +    __vmwrite(GUEST_CS_LIMIT, ~0u);
> +    /* CS.L == 1, exec, read/write, accessed. PVH 32bitfixme. */
> +    __vmwrite(GUEST_CS_AR_BYTES, 0xa09b);
> +
> +    __vmwrite(GUEST_DS_BASE, 0);
> +    __vmwrite(GUEST_DS_LIMIT, ~0u);
> +    __vmwrite(GUEST_DS_AR_BYTES, 0xc093); /* read/write, accessed */
> +
> +    __vmwrite(GUEST_SS_BASE, 0);
> +    __vmwrite(GUEST_SS_LIMIT, ~0u);
> +    __vmwrite(GUEST_SS_AR_BYTES, 0xc093); /* read/write, accessed */
> +
> +    __vmwrite(GUEST_ES_BASE, 0);
> +    __vmwrite(GUEST_ES_LIMIT, ~0u);
> +    __vmwrite(GUEST_ES_AR_BYTES, 0xc093); /* read/write, accessed */
> +
> +    __vmwrite(GUEST_FS_BASE, 0);
> +    __vmwrite(GUEST_FS_LIMIT, ~0u);
> +    __vmwrite(GUEST_FS_AR_BYTES, 0xc093); /* read/write, accessed */
> +
> +    __vmwrite(GUEST_GS_BASE, 0);
> +    __vmwrite(GUEST_GS_LIMIT, ~0u);
> +    __vmwrite(GUEST_GS_AR_BYTES, 0xc093); /* read/write, accessed */
> +
> +    __vmwrite(GUEST_GDTR_BASE, 0);
> +    __vmwrite(GUEST_GDTR_LIMIT, 0);
> +
> +    __vmwrite(GUEST_LDTR_BASE, 0);
> +    __vmwrite(GUEST_LDTR_LIMIT, 0);
> +    __vmwrite(GUEST_LDTR_AR_BYTES, 0x82); /* LDT */
> +    __vmwrite(GUEST_LDTR_SELECTOR, 0);
> +
> +    /* Guest TSS. */
> +    __vmwrite(GUEST_TR_BASE, 0);
> +    __vmwrite(GUEST_TR_LIMIT, 0xff);
> +    __vmwrite(GUEST_TR_AR_BYTES, 0x8b); /* 32-bit TSS (busy) */
> +
> +    __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
> +    __vmwrite(GUEST_DR7, 0);
> +    __vmwrite(VMCS_LINK_POINTER, ~0UL);
> +
> +    __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
> +    __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);
> +
> +    v->arch.hvm_vmx.exception_bitmap = HVM_TRAP_MASK | (1U << TRAP_debug) |
> +                                   (1U << TRAP_int3) | (1U << 
> TRAP_no_device);
> +    __vmwrite(EXCEPTION_BITMAP, v->arch.hvm_vmx.exception_bitmap);
> +
> +    /* Set WP bit so rdonly pages are not written from CPL 0. */
> +    tmpval = X86_CR0_PG | X86_CR0_NE | X86_CR0_PE | X86_CR0_WP;
> +    __vmwrite(GUEST_CR0, tmpval);
> +    __vmwrite(CR0_READ_SHADOW, tmpval);
> +    v->arch.hvm_vcpu.hw_cr[0] = v->arch.hvm_vcpu.guest_cr[0] = tmpval;
> +
> +    tmpval = real_cr4_to_pv_guest_cr4(mmu_cr4_features);
> +    __vmwrite(GUEST_CR4, tmpval);
> +    __vmwrite(CR4_READ_SHADOW, tmpval);
> +    v->arch.hvm_vcpu.guest_cr[4] = tmpval;
> +
> +    __vmwrite(CR0_GUEST_HOST_MASK, ~0UL);
> +    __vmwrite(CR4_GUEST_HOST_MASK, ~0UL);
> +
> +     v->arch.hvm_vmx.vmx_realmode = 0;
> +
> +    ept->asr  = pagetable_get_pfn(p2m_get_pagetable(p2m));
> +    __vmwrite(EPT_POINTER, ept_get_eptp(ept));
> +
> +    rdmsrl(MSR_IA32_CR_PAT, host_pat);
> +    __vmwrite(HOST_PAT, host_pat);
> +    __vmwrite(GUEST_PAT, MSR_IA32_CR_PAT_RESET);
> +
> +    /* The paging mode is updated for PVH by arch_set_info_guest(). */
> +
> +    return 0;
> +}

The majority of this function seems to be duplicating code in
construct_vmcs(), but in a different order so that it's very difficult
to tell which is which.  Wouldn't it be better to just sprinkle
if(is_pvh_domain()) around consrtuct_vmcs?

Duplicating the code like this not only makes it more difficult to
read, masking potential mistakes in making the new function, but it
also means that two places will need to be modified to take advantage
of any new features.

 -George

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.