
Re: [Xen-devel] [PATCH 9/18 V2]: PVH xen: create PVH vmcs, and initialization



On Fri, Mar 15, 2013 at 05:39:25PM -0700, Mukesh Rathor wrote:
> This patch mainly contains code to create a VMCS for PVH guest, and HVM
>  specific vcpu/domain creation code.
> 
> Changes in V2:
>   - Avoid call to hvm_do_resume() at call site rather than return in it.
>   - Return for PVH vmx_do_resume prior to intel debugger stuff.
> 
> Signed-off-by: Mukesh Rathor <mukesh.rathor@xxxxxxxxxx>
> ---
>  xen/arch/x86/hvm/hvm.c      |   90 ++++++++++++++-
>  xen/arch/x86/hvm/vmx/vmcs.c |  266 
> ++++++++++++++++++++++++++++++++++++++++++-
>  xen/arch/x86/hvm/vmx/vmx.c  |   34 ++++++
>  3 files changed, 383 insertions(+), 7 deletions(-)
> 
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index ea7adf6..18889ad 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -510,6 +510,29 @@ static int hvm_print_line(
>      return X86EMUL_OKAY;
>  }
>  
> +static int hvm_pvh_dom_initialise(struct domain *d)
> +{
> +    int rc;
> +
> +    if (!d->arch.hvm_domain.hap_enabled)
> +        return -EINVAL;
> +
> +    spin_lock_init(&d->arch.hvm_domain.irq_lock);
> +    hvm_init_guest_time(d);
> +
> +    hvm_init_cacheattr_region_list(d);
> +
> +    if ( (rc=paging_enable(d, PG_refcounts|PG_translate|PG_external)) != 0 )
> +        goto fail1;
> +
> +    if ( (rc = hvm_funcs.domain_initialise(d)) == 0 )
> +        return 0;
> +
> +fail1:

I don't think you need the label here - there is only the single 'goto fail1',
so the error path could be restructured to drop the label altogether.
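Something along these lines (untested sketch of the same logic):

    if ( (rc = paging_enable(d, PG_refcounts|PG_translate|PG_external)) == 0 )
        rc = hvm_funcs.domain_initialise(d);

    if ( rc != 0 )
        hvm_destroy_cacheattr_region_list(d);

    return rc;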

> +    hvm_destroy_cacheattr_region_list(d);
> +    return rc;
> +}
> +
>  int hvm_domain_initialise(struct domain *d)
>  {
>      int rc;
> @@ -520,6 +543,8 @@ int hvm_domain_initialise(struct domain *d)
>                   "on a non-VT/AMDV platform.\n");
>          return -EINVAL;
>      }
> +    if ( is_pvh_domain(d) )
> +        return hvm_pvh_dom_initialise(d);
>  
>      spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
>      spin_lock_init(&d->arch.hvm_domain.irq_lock);
> @@ -584,6 +609,11 @@ int hvm_domain_initialise(struct domain *d)
>  
>  void hvm_domain_relinquish_resources(struct domain *d)
>  {
> +    if ( is_pvh_domain(d) ) 
> +    {
> +        pit_deinit(d);
> +        return;
> +    }
>      if ( hvm_funcs.nhvm_domain_relinquish_resources )
>          hvm_funcs.nhvm_domain_relinquish_resources(d);
>  
> @@ -609,10 +639,14 @@ void hvm_domain_relinquish_resources(struct domain *d)
>  void hvm_domain_destroy(struct domain *d)
>  {
>      hvm_funcs.domain_destroy(d);
> +    hvm_destroy_cacheattr_region_list(d);
> +
> +    if ( is_pvh_domain(d) )
> +        return;
> +
>      rtc_deinit(d);
>      stdvga_deinit(d);
>      vioapic_deinit(d);
> -    hvm_destroy_cacheattr_region_list(d);
>  }
>  
>  static int hvm_save_tsc_adjust(struct domain *d, hvm_domain_context_t *h)
> @@ -1066,14 +1100,47 @@ static int __init 
> __hvm_register_CPU_XSAVE_save_and_restore(void)
>  }
>  __initcall(__hvm_register_CPU_XSAVE_save_and_restore);
>  
> +static int hvm_pvh_vcpu_initialise(struct vcpu *v)
> +{
> +    int rc;
> +
> +    if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 )
> +        return rc;
> +
> +    softirq_tasklet_init( &v->arch.hvm_vcpu.assert_evtchn_irq_tasklet,
> +                          (void(*)(unsigned long))hvm_assert_evtchn_irq,
> +                          (unsigned long)v );
> +
> +    v->arch.hvm_vcpu.hcall_64bit = 1;
> +    v->arch.hvm_vcpu.hvm_pvh.vcpu_info_mfn = INVALID_MFN;
> +    v->arch.user_regs.eflags = 2;

So that sets the reserved must-be-one flag (bit 1). Could you include a comment
explaining why? Is it because we later bit-shift it and use it to figure out
whether IOPL needs to be virtualized in arch_set_info_guest()? Or is it just
because this function is based on hvm_vcpu_initialise()? If the latter - since
you are being called from it, can you skip setting it here?
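If it does need to stay, maybe spell it out with the existing constant, e.g.
(assuming X86_EFLAGS_MBS is the right name for the must-be-set bit):

    /* Bit 1 of EFLAGS is architecturally reserved and must be set. */
    v->arch.user_regs.eflags = X86_EFLAGS_MBS;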


> +    v->arch.hvm_vcpu.inject_trap.vector = -1;
> +
> +    if ( (rc=hvm_vcpu_cacheattr_init(v)) != 0 ) {

The coding style here is off - missing spaces around the '=' and the opening
brace belongs on its own line.
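i.e. per the usual Xen style, something like:

    if ( (rc = hvm_vcpu_cacheattr_init(v)) != 0 )
    {
        hvm_funcs.vcpu_destroy(v);
        return rc;
    }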

> +        hvm_funcs.vcpu_destroy(v);
> +        return rc;
> +    }
> +
> +    /* during domain shutdown: pvh_vmx_vmexit_handler->emulate_privileged_op
> +     * -> guest_io_read -> pv_pit_handler -> handle_speaker_io -> _spin_lock
> +     *  so we call pit_init to initialize the spin lock */
> +    if ( v->vcpu_id == 0 )
> +        pit_init(v, cpu_khz);
> +
> +    return 0;
> +}
> +
>  int hvm_vcpu_initialise(struct vcpu *v)
>  {
>      int rc;
>      struct domain *d = v->domain;
> -    domid_t dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN];
> +    domid_t dm_domid; 

Not sure I follow - why move it further down?
>  
>      hvm_asid_flush_vcpu(v);
>  
> +    if ( is_pvh_vcpu(v) )
> +        return hvm_pvh_vcpu_initialise(v);
> +
>      if ( (rc = vlapic_init(v)) != 0 )
>          goto fail1;
>  
> @@ -1084,6 +1151,8 @@ int hvm_vcpu_initialise(struct vcpu *v)
>           && (rc = nestedhvm_vcpu_initialise(v)) < 0 ) 
>          goto fail3;
>  
> +    dm_domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN];
> +

?

>      /* Create ioreq event channel. */
>      rc = alloc_unbound_xen_event_channel(v, dm_domid, NULL);
>      if ( rc < 0 )
> @@ -1163,7 +1232,10 @@ void hvm_vcpu_destroy(struct vcpu *v)
>  
>      tasklet_kill(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet);
>      hvm_vcpu_cacheattr_destroy(v);
> -    vlapic_destroy(v);
> +
> +    if ( !is_pvh_vcpu(v) )
> +        vlapic_destroy(v);
> +
>      hvm_funcs.vcpu_destroy(v);
>  
>      /* Event channel is already freed by evtchn_destroy(). */
> @@ -4514,6 +4586,8 @@ static int hvm_memory_event_traps(long p, uint32_t 
> reason,
>  
>  void hvm_memory_event_cr0(unsigned long value, unsigned long old) 
>  {
> +    if ( is_pvh_vcpu(current) )
> +        return;
>      hvm_memory_event_traps(current->domain->arch.hvm_domain
>                               .params[HVM_PARAM_MEMORY_EVENT_CR0],
>                             MEM_EVENT_REASON_CR0,
> @@ -4522,6 +4596,8 @@ void hvm_memory_event_cr0(unsigned long value, unsigned 
> long old)
>  
>  void hvm_memory_event_cr3(unsigned long value, unsigned long old) 
>  {
> +    if ( is_pvh_vcpu(current) )
> +        return;
>      hvm_memory_event_traps(current->domain->arch.hvm_domain
>                               .params[HVM_PARAM_MEMORY_EVENT_CR3],
>                             MEM_EVENT_REASON_CR3,
> @@ -4530,6 +4606,8 @@ void hvm_memory_event_cr3(unsigned long value, unsigned 
> long old)
>  
>  void hvm_memory_event_cr4(unsigned long value, unsigned long old) 
>  {
> +    if ( is_pvh_vcpu(current) )
> +        return;
>      hvm_memory_event_traps(current->domain->arch.hvm_domain
>                               .params[HVM_PARAM_MEMORY_EVENT_CR4],
>                             MEM_EVENT_REASON_CR4,
> @@ -4538,6 +4616,8 @@ void hvm_memory_event_cr4(unsigned long value, unsigned 
> long old)
>  
>  void hvm_memory_event_msr(unsigned long msr, unsigned long value)
>  {
> +    if ( is_pvh_vcpu(current) )
> +        return;
>      hvm_memory_event_traps(current->domain->arch.hvm_domain
>                               .params[HVM_PARAM_MEMORY_EVENT_MSR],
>                             MEM_EVENT_REASON_MSR,
> @@ -4550,6 +4630,8 @@ int hvm_memory_event_int3(unsigned long gla)
>      unsigned long gfn;
>      gfn = paging_gva_to_gfn(current, gla, &pfec);
>  
> +    if ( is_pvh_vcpu(current) )
> +        return 0;
>      return hvm_memory_event_traps(current->domain->arch.hvm_domain
>                                      .params[HVM_PARAM_MEMORY_EVENT_INT3],
>                                    MEM_EVENT_REASON_INT3,
> @@ -4562,6 +4644,8 @@ int hvm_memory_event_single_step(unsigned long gla)
>      unsigned long gfn;
>      gfn = paging_gva_to_gfn(current, gla, &pfec);
>  
> +    if ( is_pvh_vcpu(current) )
> +        return 0;
>      return hvm_memory_event_traps(current->domain->arch.hvm_domain
>              .params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP],
>              MEM_EVENT_REASON_SINGLESTEP,
> diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
> index 9926ffb..b0bea9c 100644
> --- a/xen/arch/x86/hvm/vmx/vmcs.c
> +++ b/xen/arch/x86/hvm/vmx/vmcs.c
> @@ -624,7 +624,7 @@ void vmx_vmcs_exit(struct vcpu *v)
>      {
>          /* Don't confuse vmx_do_resume (for @v or @current!) */
>          vmx_clear_vmcs(v);
> -        if ( is_hvm_vcpu(current) )
> +        if ( is_hvm_or_pvh_vcpu(current) )
>              vmx_load_vmcs(current);
>  
>          spin_unlock(&v->arch.hvm_vmx.vmcs_lock);
> @@ -815,6 +815,253 @@ void virtual_vmcs_vmwrite(void *vvmcs, u32 
> vmcs_encoding, u64 val)
>      virtual_vmcs_exit(vvmcs);
>  }
>  
> +static int pvh_construct_vmcs(struct vcpu *v)
> +{
> +    uint16_t sysenter_cs;
> +    unsigned long sysenter_eip;
> +    struct domain *d = v->domain;
> +    struct p2m_domain *p2m = p2m_get_hostp2m(d);
> +    struct ept_data *ept = &p2m->ept;
> +    u32 vmexit_ctl = vmx_vmexit_control;
> +    u32 vmentry_ctl = vmx_vmentry_control;
> +    u64 required, tmpval = -1;
> +
> +    if ( !paging_mode_hap(d) )
> +    {
> +        printk("ERROR: HAP is required to run PV in HVM container\n");
> +        return -EINVAL;
> +    }
> +
> +    /* VMCS controls. */
> +    vmx_pin_based_exec_control &= ~PIN_BASED_VIRTUAL_NMIS;
> +    __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
> +
> +    v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
> +
> +    /* if rdtsc exiting is turned on and it goes thru emulate_privileged_op,
> +     * then pv_vcpu.ctrlreg must be added to pvh struct */

That would be the 'timer_mode' setting in the guest config, right? Perhaps then
add a check at the top of the function to see which timer_mode is in use and
bail out with -ENOSYS?
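A very rough sketch of the kind of guard I mean (I have not checked which knob
actually ends up forcing rdtsc exiting - it may be the TSC mode rather than
timer_mode):

    /* Sketch only: PVH does not emulate rdtsc, so refuse configurations
     * that would require it (d->arch.vtsc set means the TSC is emulated). */
    if ( d->arch.vtsc )
        return -ENOSYS;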


> +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING;
> +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING;
> +
> +    v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING |
> +                                      CPU_BASED_CR3_LOAD_EXITING |
> +                                      CPU_BASED_CR3_STORE_EXITING);
> +    v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;

Is that because the PV code ends up making the SCHED_yield_op hypercall and
we don't need the monitor/mwait capability? If so, could you add a comment to
that effect please?

> +    v->arch.hvm_vmx.exec_control |= CPU_BASED_ACTIVATE_MSR_BITMAP;
> +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_TPR_SHADOW;
> +    v->arch.hvm_vmx.exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
> +
> +    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
> +
> +    /* I/O access bitmap. */
> +    __vmwrite(IO_BITMAP_A, virt_to_maddr((char *)hvm_io_bitmap + 0));
> +    __vmwrite(IO_BITMAP_B, virt_to_maddr((char *)hvm_io_bitmap + PAGE_SIZE));
> +
> +    /* MSR access bitmap. */
> +    if ( cpu_has_vmx_msr_bitmap )
> +    {
> +        unsigned long *msr_bitmap = alloc_xenheap_page();
> +        int msr_type = MSR_TYPE_R | MSR_TYPE_W;
> +
> +        if ( msr_bitmap == NULL )
> +            return -ENOMEM;
> +
> +        memset(msr_bitmap, ~0, PAGE_SIZE);
> +        v->arch.hvm_vmx.msr_bitmap = msr_bitmap;
> +        __vmwrite(MSR_BITMAP, virt_to_maddr(msr_bitmap));
> +
> +        vmx_disable_intercept_for_msr(v, MSR_FS_BASE, msr_type);
> +        vmx_disable_intercept_for_msr(v, MSR_GS_BASE, msr_type);
> +        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS, msr_type);
> +        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP, msr_type);
> +        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP, msr_type);
> +        vmx_disable_intercept_for_msr(v, MSR_SHADOW_GS_BASE, msr_type);

So this looks the same as the one in construct_vmcs() in vmcs.c, except that one has this extra:

 895         if ( cpu_has_vmx_pat && paging_mode_hap(d) )
 896             vmx_disable_intercept_for_msr(v, MSR_IA32_CR_PAT, MSR_TYPE_R | MSR_TYPE_W);
 897     }

Did you miss that?

> +
> +        /* pure hvm doesn't do this. safe? see: long_mode_do_msr_write() */
> +#if 0
> +        vmx_disable_intercept_for_msr(v, MSR_STAR);
> +        vmx_disable_intercept_for_msr(v, MSR_LSTAR);
> +        vmx_disable_intercept_for_msr(v, MSR_CSTAR);
> +        vmx_disable_intercept_for_msr(v, MSR_SYSCALL_MASK);
> +#endif

I would just provide a comment saying:

        /*
         * long_mode_do_msr_write() takes care of
         * MSR_[STAR|LSTAR|CSTAR|SYSCALL_MASK].
         */

> +    } else {
> +        printk("PVH: CPU does NOT have msr bitmap\n");

Perhaps:

        printk(XENLOG_G_ERR "%s: ..\n", __func__);
?
> +        return -EINVAL;
> +    }
> +
> +    if ( !cpu_has_vmx_vpid ) {
> +        printk("PVH: At present VPID support is required to run PVH\n");

Should you de-allocate msr_bitmap at this point?

Or perhaps move this check (and the one below) to the start of the
function? So you have:

        if ( !cpu_has_vmx_vpid )
            gdprintk(XENLOG_ERR, "%s: VPID required for PVH mode.\n", __func__);

        if ( !cpu_has_vmx_secondary_exec_control )
                .. bla bla


> +        return -EINVAL;
> +    }
> +
> +    v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control;
> +
> +    if ( cpu_has_vmx_secondary_exec_control ) {
> +        v->arch.hvm_vmx.secondary_exec_control &= ~0x4FF; /* turn off all */
> +        v->arch.hvm_vmx.secondary_exec_control |= 
> +                                              
> SECONDARY_EXEC_PAUSE_LOOP_EXITING;
> +        v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_VPID;
> +
> +        v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_EPT;
> +        __vmwrite(SECONDARY_VM_EXEC_CONTROL,
> +                  v->arch.hvm_vmx.secondary_exec_control);
> +    } else {
> +        printk("PVH: NO Secondary Exec control\n");
> +        return -EINVAL;

Ditto - should you de-allocate msr_bitmap? Or, if you move the
cpu_has_vmx_secondary_exec_control check to the start of the function as
suggested above, there is no need for this if/else at all.
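If the check stays here, the error path would want something like (sketch):

        free_xenheap_page(v->arch.hvm_vmx.msr_bitmap);
        v->arch.hvm_vmx.msr_bitmap = NULL;
        return -EINVAL;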


> +    }
> +
> +    __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl);
> +
> +    #define VM_ENTRY_LOAD_DEBUG_CTLS 0x4
> +    #define VM_ENTRY_LOAD_EFER 0x8000
> +    vmentry_ctl &= ~VM_ENTRY_LOAD_DEBUG_CTLS;
> +    vmentry_ctl &= ~VM_ENTRY_LOAD_EFER;
> +    vmentry_ctl &= ~VM_ENTRY_SMM;
> +    vmentry_ctl &= ~VM_ENTRY_DEACT_DUAL_MONITOR;
> +    vmentry_ctl |= VM_ENTRY_IA32E_MODE;
> +    __vmwrite(VM_ENTRY_CONTROLS, vmentry_ctl);
> +

From here on, it looks mostly the same as construct_vmcs(), right?

Perhaps you can add a comment saying so, so that when a cleanup effort
is done later on these can be candidates for it?

> +    /* MSR intercepts. */
> +    __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
> +    __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
> +    __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
> +
> +    /* Host data selectors. */
> +    __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
> +    __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
> +    __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS);
> +    __vmwrite(HOST_FS_SELECTOR, 0);
> +    __vmwrite(HOST_GS_SELECTOR, 0);
> +    __vmwrite(HOST_FS_BASE, 0);
> +    __vmwrite(HOST_GS_BASE, 0);
> +
> +    vmx_set_host_env(v);
> +
> +    /* Host control registers. */
> +    v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS;
> +    __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
> +    __vmwrite(HOST_CR4, mmu_cr4_features|(cpu_has_xsave ? X86_CR4_OSXSAVE : 
> 0));

That formatting looks odd - no spaces around the '|'.

> +
> +    /* Host CS:RIP. */
> +    __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS);
> +    __vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler);
> +
> +    /* Host SYSENTER CS:RIP. */
> +    rdmsrl(MSR_IA32_SYSENTER_CS, sysenter_cs);
> +    __vmwrite(HOST_SYSENTER_CS, sysenter_cs);
> +    rdmsrl(MSR_IA32_SYSENTER_EIP, sysenter_eip);
> +    __vmwrite(HOST_SYSENTER_EIP, sysenter_eip);
> +
> +    __vmwrite(VM_ENTRY_INTR_INFO, 0);
> +
> +    __vmwrite(CR3_TARGET_COUNT, 0);
> +
> +    __vmwrite(GUEST_ACTIVITY_STATE, 0);
> +
> +    /* Set default guest context values here. Some of these are then 
> overwritten
> +     * in vmx_pvh_set_vcpu_info() by guest itself during vcpu bringup */
> +    __vmwrite(GUEST_CS_BASE, 0);
> +    __vmwrite(GUEST_CS_LIMIT, ~0u);
> +    __vmwrite(GUEST_CS_AR_BYTES, 0xa09b); /* CS.L == 1 */
> +    __vmwrite(GUEST_CS_SELECTOR, 0x10);

0x10. Could you use a #define for it? Somehow I thought it would be running
in FLAT_KERNEL_CS, but that would be odd. And of course, since we are booting
the Linux kernel without the PV MMU, we would be using its native segments.
So this would correspond to GDT_ENTRY_KERNEL_CS, right? You might want to
mention that, so that if the Linux kernel ever alters its GDT layout we don't
blow up.

Though I guess it does not matter - these are just the initial bootstrap
segments. Presumably the load_gdt in the Linux kernel later on resets them
to whatever the "new" GDT is.
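
Something like this is what I had in mind (names made up, values matching what
the patch already uses):

    #define PVH_GUEST_KERNEL_CS 0x10  /* Linux GDT_ENTRY_KERNEL_CS << 3 */
    #define PVH_GUEST_KERNEL_DS 0x18  /* Linux GDT_ENTRY_KERNEL_DS << 3 */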

> +
> +    __vmwrite(GUEST_DS_BASE, 0);
> +    __vmwrite(GUEST_DS_LIMIT, ~0u);
> +    __vmwrite(GUEST_DS_AR_BYTES, 0xc093);
> +    __vmwrite(GUEST_DS_SELECTOR, 0x18);
> +
> +    __vmwrite(GUEST_SS_BASE, 0);         /* use same seg as DS */
> +    __vmwrite(GUEST_SS_LIMIT, ~0u);
> +    __vmwrite(GUEST_SS_AR_BYTES, 0xc093);
> +    __vmwrite(GUEST_SS_SELECTOR, 0x18);
> +
> +    __vmwrite(GUEST_ES_SELECTOR, 0);
> +    __vmwrite(GUEST_FS_SELECTOR, 0);
> +    __vmwrite(GUEST_GS_SELECTOR, 0);
> +
> +    /* Guest segment bases. */
> +    __vmwrite(GUEST_ES_BASE, 0);
> +    __vmwrite(GUEST_FS_BASE, 0);
> +    __vmwrite(GUEST_GS_BASE, 0);
> +
> +    /* Guest segment limits. */
> +    __vmwrite(GUEST_ES_LIMIT, ~0u);
> +    __vmwrite(GUEST_FS_LIMIT, ~0u);
> +    __vmwrite(GUEST_GS_LIMIT, ~0u);
> +
> +    /* Guest segment AR bytes. */
> +    __vmwrite(GUEST_ES_AR_BYTES, 0xc093); /* read/write, accessed */
> +    __vmwrite(GUEST_FS_AR_BYTES, 0xc093);
> +    __vmwrite(GUEST_GS_AR_BYTES, 0xc093);
> +
> +    /* Guest IDT. */
> +    __vmwrite(GUEST_GDTR_BASE, 0);
> +    __vmwrite(GUEST_GDTR_LIMIT, 0);
> +
> +    /* Guest LDT. */
> +    __vmwrite(GUEST_LDTR_AR_BYTES, 0x82); /* LDT */
> +    __vmwrite(GUEST_LDTR_SELECTOR, 0);
> +    __vmwrite(GUEST_LDTR_BASE, 0);
> +    __vmwrite(GUEST_LDTR_LIMIT, 0);
> +
> +    /* Guest TSS. */
> +    __vmwrite(GUEST_TR_AR_BYTES, 0x8b); /* 32-bit TSS (busy) */
> +    __vmwrite(GUEST_TR_BASE, 0);
> +    __vmwrite(GUEST_TR_LIMIT, 0xff);
> +
> +    __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
> +    __vmwrite(GUEST_DR7, 0);
> +    __vmwrite(VMCS_LINK_POINTER, ~0UL);
> +
> +    __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
> +    __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);

Odd - in construct_vmcs() in vmcs.c these are set somewhat higher up in the code.

> +
> +    v->arch.hvm_vmx.exception_bitmap = 
> +                                   HVM_TRAP_MASK     | (1 << TRAP_debug) | 
> +                                   (1U << TRAP_int3) | (1U << 
> TRAP_no_device);

The formatting here is odd - trailing whitespace, and a mix of '1 <<' and '1U <<'.
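i.e. I would expect something more like:

    v->arch.hvm_vmx.exception_bitmap = HVM_TRAP_MASK |
                                       (1U << TRAP_debug) |
                                       (1U << TRAP_int3) |
                                       (1U << TRAP_no_device);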

> +    __vmwrite(EXCEPTION_BITMAP, v->arch.hvm_vmx.exception_bitmap);
> +
> +    __vmwrite(TSC_OFFSET, 0);

Hm, so you did earlier:

v->arch.hvm_vmx.exec_control &= ~CPU_BASED_USE_TSC_OFFSETING;

so is this necessary? Or is it just that you want it set to a default
baseline state?

> +
> +    /* Set WP bit so rdonly pages are not written from CPL 0 */
> +    tmpval = X86_CR0_PG | X86_CR0_NE | X86_CR0_PE | X86_CR0_WP;
> +    __vmwrite(GUEST_CR0, tmpval);
> +    __vmwrite(CR0_READ_SHADOW, tmpval);
> +    v->arch.hvm_vcpu.hw_cr[0] = v->arch.hvm_vcpu.guest_cr[0] = tmpval;
> +
> +    tmpval = real_cr4_to_pv_guest_cr4(mmu_cr4_features);
> +    required = X86_CR4_PAE | X86_CR4_VMXE | X86_CR4_OSFXSR;
> +    if ( (tmpval & required) != required )
> +    {
> +        printk("PVH: required CR4 features not available:%lx\n", required);
> +        return -EINVAL;

You might want to move that check to the top of the function. Or, if you want
it here, then at least free msr_bitmap first (same as the earlier error paths).

> +    }
> +    __vmwrite(GUEST_CR4, tmpval);
> +    __vmwrite(CR4_READ_SHADOW, tmpval);
> +    v->arch.hvm_vcpu.guest_cr[4] = tmpval;
> +
> +    __vmwrite(CR0_GUEST_HOST_MASK, ~0UL);
> +    __vmwrite(CR4_GUEST_HOST_MASK, ~0UL);
> +
> +     v->arch.hvm_vmx.vmx_realmode = 0;
> +
> +    ept->asr  = pagetable_get_pfn(p2m_get_pagetable(p2m));
> +    __vmwrite(EPT_POINTER, ept_get_eptp(ept));
> +
> +    if ( cpu_has_vmx_pat )
> +    {
> +        u64 host_pat, guest_pat;
> +
> +        rdmsrl(MSR_IA32_CR_PAT, host_pat);
> +        guest_pat = MSR_IA32_CR_PAT_RESET;
> +
> +        __vmwrite(HOST_PAT, host_pat);
> +        __vmwrite(GUEST_PAT, guest_pat);
> +    }
> +    return 0;
> +}
> +
>  static int construct_vmcs(struct vcpu *v)
>  {
>      struct domain *d = v->domain;
> @@ -825,6 +1072,12 @@ static int construct_vmcs(struct vcpu *v)
>  
>      vmx_vmcs_enter(v);
>  
> +    if ( is_pvh_vcpu(v) ) {
> +        int rc = pvh_construct_vmcs(v);
> +        vmx_vmcs_exit(v);

Do you need to call paging_update_paging_modes as construct_vmcs() does?

> +        return rc;
> +    }
> +
>      /* VMCS controls. */
>      __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
>  
> @@ -1259,8 +1512,10 @@ void vmx_do_resume(struct vcpu *v)
>  
>          vmx_clear_vmcs(v);
>          vmx_load_vmcs(v);
> -        hvm_migrate_timers(v);
> -        hvm_migrate_pirqs(v);
> +        if ( !is_pvh_vcpu(v) ) {
> +            hvm_migrate_timers(v);
> +            hvm_migrate_pirqs(v);
> +        }
>          vmx_set_host_env(v);
>          /*
>           * Both n1 VMCS and n2 VMCS need to update the host environment 
> after 
> @@ -1272,6 +1527,9 @@ void vmx_do_resume(struct vcpu *v)
>          hvm_asid_flush_vcpu(v);
>      }
>  
> +    if ( is_pvh_vcpu(v) )
> +        reset_stack_and_jump(vmx_asm_do_vmentry);
> +
>      debug_state = v->domain->debugger_attached
>                    || 
> v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_INT3]
>                    || 
> v->domain->arch.hvm_domain.params[HVM_PARAM_MEMORY_EVENT_SINGLE_STEP];
> @@ -1455,7 +1713,7 @@ static void vmcs_dump(unsigned char ch)
>  
>      for_each_domain ( d )
>      {
> -        if ( !is_hvm_domain(d) )
> +        if ( !is_hvm_or_pvh_domain(d) )
>              continue;
>          printk("\n>>> Domain %d <<<\n", d->domain_id);
>          for_each_vcpu ( d, v )
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index e64980f..194c87b 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -79,6 +79,9 @@ static int vmx_domain_initialise(struct domain *d)
>  {
>      int rc;
>  
> +    if ( is_pvh_domain(d) )
> +        return 0;
> +
>      if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 )
>          return rc;
>  
> @@ -87,6 +90,9 @@ static int vmx_domain_initialise(struct domain *d)
>  
>  static void vmx_domain_destroy(struct domain *d)
>  {
> +    if ( is_pvh_domain(d) )
> +        return;
> +
>      vmx_free_vlapic_mapping(d);
>  }
>  
> @@ -110,6 +116,12 @@ static int vmx_vcpu_initialise(struct vcpu *v)
>  
>      vpmu_initialise(v);
>  
> +    if (is_pvh_vcpu(v) ) 
> +    {
> +        /* this for hvm_long_mode_enabled(v) */
> +        v->arch.hvm_vcpu.guest_efer = EFER_SCE | EFER_LMA | EFER_LME;
> +        return 0;
> +    }
>      vmx_install_vlapic_mapping(v);
>  
>      /* %eax == 1 signals full real-mode support to the guest loader. */
> @@ -1033,6 +1045,23 @@ static void vmx_update_host_cr3(struct vcpu *v)
>      vmx_vmcs_exit(v);
>  }
>  
> +static void vmx_update_pvh_cr(struct vcpu *v, unsigned int cr)
> +{
> +    vmx_vmcs_enter(v);
> +    switch ( cr )
> +    {
> +        case 3:
> +            __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.guest_cr[3]);
> +            hvm_asid_flush_vcpu(v);
> +            break;
> +
> +        default:
> +            printk("PVH: d%d v%d unexpected cr%d update at rip:%lx\n", 

You are missing the XENLOG_ERR part.
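i.e. something like (XENLOG_G_ERR if it is guest-triggerable, per the earlier
comment):

    printk(XENLOG_G_ERR "PVH: d%d v%d unexpected cr%d update at rip:%lx\n",
           v->domain->domain_id, v->vcpu_id, cr, __vmread(GUEST_RIP));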

> +                   v->domain->domain_id, v->vcpu_id, cr, 
> __vmread(GUEST_RIP));
> +    }
> +    vmx_vmcs_exit(v);
> +}
> +
>  void vmx_update_debug_state(struct vcpu *v)
>  {
>      unsigned long mask;
> @@ -1052,6 +1081,11 @@ void vmx_update_debug_state(struct vcpu *v)
>  
>  static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
>  {
> +    if ( is_pvh_vcpu(v) ) {
> +        vmx_update_pvh_cr(v, cr);
> +        return;
> +    }
> +
>      vmx_vmcs_enter(v);
>  
>      switch ( cr )
> -- 
> 1.7.2.3
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxx
> http://lists.xen.org/xen-devel
> 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

