
Re: [Xen-devel] [PATCH 17/18 V2]: PVH xen: PVH dom0 creation...



On Fri, Mar 15, 2013 at 06:06:45PM -0700, Mukesh Rathor wrote:
>  Finally, the hardest. Mostly modify construct_dom0() to boot PV dom0 in
>  PVH mode. Introduce opt_dom0pvh, which, when specified on the command
>  line, causes dom0 to boot in PVH mode.
> 
> Change in V2:
>   - Map the entire IO region upfront in the P2M for PVH dom0.
> 
> Signed-off-by: Mukesh Rathor <mukesh.rathor@xxxxxxxxxx>
> ---
>  xen/arch/x86/domain_build.c |  241 +++++++++++++++++++++++++++++++++----------
>  xen/arch/x86/mm/hap/hap.c   |   17 +++-
>  xen/arch/x86/setup.c        |   10 ++-
>  xen/include/asm-x86/hap.h   |    1 +
>  4 files changed, 212 insertions(+), 57 deletions(-)
> 
> diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
> index 8c5b27a..72aa70b 100644
> --- a/xen/arch/x86/domain_build.c
> +++ b/xen/arch/x86/domain_build.c
> @@ -35,6 +35,8 @@
>  #include <asm/setup.h>
>  #include <asm/bzimage.h> /* for bzimage_parse */
>  #include <asm/io_apic.h>
> +#include <asm/hap.h>
> +#include <asm/debugger.h>
>  
>  #include <public/version.h>
>  
> @@ -307,6 +309,65 @@ static void __init process_dom0_ioports_disable(void)
>      }
>  }
>  
> +/* 
> + * Set the 1:1 map for all non-RAM regions for dom 0. Thus, dom0 will have
> + * the entire io region mapped in the EPT/NPT.
> + */
> +static __init void  pvh_map_all_iomem(struct domain *d)
> +{
> +    unsigned long start = 0;
> +    const struct e820entry *entry;
> +    int rc, i, nump;

unsigned int for 'nump'.

Could it be called 'pfns' ?
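I.e.:

    int rc, i;
    unsigned int pfns;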

> +
> +    for (i = 0, entry = e820.map; i < e820.nr_map; i++, entry++) {
> +        unsigned long end = entry->addr + entry->size;
> +
> +        if (entry->type == E820_RAM || i == e820.nr_map - 1) {
> +            unsigned long start_pfn = PFN_DOWN(start);
> +            unsigned long end_pfn = PFN_UP(end);
> +
> +            if (entry->type == E820_RAM)
> +                end_pfn = PFN_UP(entry->addr);
> +
> +            if (start_pfn < end_pfn) {
> +                nump = end_pfn - start_pfn + 1;
> +                rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);

Probably want a comment for the '1' by saying: /* adding */.
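I.e.:

                rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1 /* adding */);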

> +                BUG_ON(rc);

It would help to have a bit more detail. Perhaps:

                if (rc) {
                        printk(XENLOG_ERR "1-1 mapping on %lx -> %lx for %u failed with %d!\n",
                               start_pfn, end_pfn, pfns, rc);
                        BUG();
                }
> +            }
> +            start = end;
> +        }
> +    }
> +}
> +
> +static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
> +                                   unsigned long mfn, unsigned long vphysmap_s)
> +{
> +    if ( is_pvh_domain(d) ) {
> +        int rc = guest_physmap_add_page(d, pfn, mfn, 0);

A comment about the 0 would be good. And you can collapse this into:

        BUG_ON( guest_physmap_add_page(d, pfn, mfn, 0 /* order */) );

> +        BUG_ON(rc);
> +        return;
> +    }
> +    if ( !is_pv_32on64_domain(d) )
> +        ((unsigned long *)vphysmap_s)[pfn] = mfn;
> +    else
> +        ((unsigned int *)vphysmap_s)[pfn] = mfn;
> +
> +    set_gpfn_from_mfn(mfn, pfn);
> +}
> +
> +static __init void copy_pvh(char *dest, char *src, int bytes)
> +{
> +    /* raw_copy_to_guest() -> copy_to_user_hvm -> __hvm_copy needs curr 
> +     * to point to the hvm/pvh vcpu. Hence for PVH dom0 we can't use that.
> +     * So we just use dbg_rw_mem().
> +     */
> +    int rem = dbg_rw_mem((dbgva_t)dest, (unsigned char *)src, bytes, 0, 1, 0);
> +    if (rem) {
> +        printk("PVH: Failed to copy to dom0. len:%d rem:%d\n", bytes, rem);
> +        BUG();

Wait a minute? This is debug code, but you are using it for copying?! Why not
__copy_to_user?
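E.g. (just a sketch, assuming dom0's page tables are the current ones at this
point):

    if ( __copy_to_user(dest, src, bytes) )
        BUG();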

> +    }
> +}
> +
>  int __init construct_dom0(
>      struct domain *d,
>      const module_t *image, unsigned long image_headroom,
> @@ -314,6 +375,7 @@ int __init construct_dom0(
>      void *(*bootstrap_map)(const module_t *),
>      char *cmdline)
>  {
> +    char *si_buf=NULL, *tmp_buf=NULL;
>      int i, cpu, rc, compatible, compat32, order, machine;
>      struct cpu_user_regs *regs;
>      unsigned long pfn, mfn;
> @@ -322,7 +384,7 @@ int __init construct_dom0(
>      unsigned long alloc_spfn;
>      unsigned long alloc_epfn;
>      unsigned long initrd_pfn = -1, initrd_mfn = 0;
> -    unsigned long count;
> +    unsigned long count, shared_info_pfn_addr = 0;
>      struct page_info *page = NULL;
>      start_info_t *si;
>      struct vcpu *v = d->vcpu[0];
> @@ -416,6 +478,13 @@ int __init construct_dom0(
>      {
>          printk("Kernel does not support Dom0 operation\n");
>          return -EINVAL;

You are adding it right below the return - so it will never be reached. The
check needs to move out of this block; see the sketch after the hunk.

> +
> +        if ( is_pvh_domain(d) && 
> +             !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) ) 
> +        {
> +            printk("Kernel does not support PVH mode\n");
> +            return -EINVAL;
> +        }
>      }
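I.e. hoist it out (just a sketch):

        printk("Kernel does not support Dom0 operation\n");
        return -EINVAL;
    }

    if ( is_pvh_domain(d) &&
         !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) )
    {
        printk("Kernel does not support PVH mode\n");
        return -EINVAL;
    }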
>  
>      if ( compat32 )
> @@ -480,6 +549,12 @@ int __init construct_dom0(


So... what about this (which is right above the vstartinfo_end):

 474     vphysmap_end     = vphysmap_start + (nr_pages * (!is_pv_32on64_domain(d) ?
 475                                                      sizeof(unsigned long) :
 476                                                      sizeof(unsigned int)));

That is the code that figures out how big the P2M array should be.
Shouldn't that be just
        vphysmap_end = vphysmap_start?

Or are you doing some cunning work and setting the XEN_ELFNOTE_INIT_P2M to
point to some wacky address, and later on in this code checking for the
base (parms.p2m_base) and, if so, not creating the necessary space for
the P2M array?
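If not, something like this (just a sketch) would avoid reserving the array:

    if ( is_pvh_domain(d) )
        vphysmap_end = vphysmap_start;
    else
        vphysmap_end = vphysmap_start + (nr_pages *
                                         (!is_pv_32on64_domain(d) ?
                                          sizeof(unsigned long) :
                                          sizeof(unsigned int)));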

>      vstartinfo_end   = (vstartinfo_start +
>                          sizeof(struct start_info) +
>                          sizeof(struct dom0_vga_console_info));
> +
> +    if ( is_pvh_domain(d) ) {
> +        shared_info_pfn_addr = round_pgup(vstartinfo_end) - v_start;

Huh. I don't think that is the PFN. Rather it is the physical address
offset. Perhaps it should be called 'shared_info_gaddr' ? Looking at the code
above, this means it is past the kernel, past the ramdisk, past the P2M array,
and past the 'struct start_info' and 'dom0_vga_console_info'.

Which means that in include/public/xen.h also needs this:

 *  1. The domain is started within contiguous virtual-memory region.
 *  2. The contiguous region ends on an aligned 4MB boundary.
 *  3. This the order of bootstrap elements in the initial virtual region:
 *      a. relocated kernel image
 *      b. initial ram disk              [mod_start, mod_len]
 *      c. list of allocated page frames [mfn_list, nr_pages]
 *         (unless relocated due to XEN_ELFNOTE_INIT_P2M)
 *      d. start_info_t structure        [register ESI (x86)]

==>     d1. And struct shared_info_t structure  [shared_info]
            if autotranslated guest.

 *      e. bootstrap page tables         [pt_base and CR3 (x86)]
 *      f. bootstrap stack               [register ESP (x86)]


Which looks to match what the toolstack does.


> +        vstartinfo_end   += PAGE_SIZE;
> +    }
> +
>      vpt_start        = round_pgup(vstartinfo_end);
>      for ( nr_pt_pages = 2; ; nr_pt_pages++ )
>      {
> @@ -621,16 +696,26 @@ int __init construct_dom0(
>          maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l3_page_table;
>          l3start = __va(mpt_alloc); mpt_alloc += PAGE_SIZE;
>      }
> -    clear_page(l4tab);
> -    init_guest_l4_table(l4tab, d);
> -    v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
> -    if ( is_pv_32on64_domain(d) )
> -        v->arch.guest_table_user = v->arch.guest_table;
> +    if ( is_pvh_domain(d) )
> +    {
> +        v->arch.guest_table = pagetable_from_paddr(vpt_start - v_start);

Couldn't we use mpt_alloc? It should point to the same location?
So with this code we still allocated one page for the L4:

 616         page = alloc_domheap_page(NULL, 0);

Do you think it makes sense to free it? You are not
using it - you are using the mpt_alloc as your L4. And you did
not clean it either. Perhaps also have

        clear_page(mpt_alloc);
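And perhaps free the now-unused L4 page (just a sketch):

    if ( is_pvh_domain(d) )
        free_domheap_page(page);    /* 'page' from alloc_domheap_page() above */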


> +        pfn = 0;

OK, so instead of starting at alloc_spfn, which is the starting MFN
for the pages that cover the kernel, initrd, P2M array and the three
various PAGE_SIZE structs, you start at zero.

I think this warrants a comment. I believe this is b/c it does not
matter to Xen - you are just creating nice initial page-tables
that the guest can use, and the MFNs in them have to be PFNs as
it is running in auto-translated mode, right?

In which case you could also give this extra comment:

        /* Since the guest is going to run in auto-translated
         * mode and none of the paging is done via PV MMU calls
         * we create the page-tables from GPFN=0 up to bootstrap.
         * TODO: Instead of using 4KB L4 we could use 2MB or so
         * similar to how the P2M array can if parms.p2m_base is set
         */

Which begs a question: you are doing it from PFN 0 up to d->tot_pages
and also include the P2M array in this region. Why?

> +    } else { 
> +        clear_page(l4tab);
> +        init_guest_l4_table(l4tab, d);
> +        v->arch.guest_table = pagetable_from_paddr(__pa(l4start));
> +        if ( is_pv_32on64_domain(d) )
> +            v->arch.guest_table_user = v->arch.guest_table;
> +        pfn = alloc_spfn;
> +    }
>  
>      l4tab += l4_table_offset(v_start);
> -    pfn = alloc_spfn;
>      for ( count = 0; count < ((v_end-v_start)>>PAGE_SHIFT); count++ )
>      {
> +        /* initrd chunk's mfns are separate, so we need to adjust for them */

Perhaps you can say:

        /* initrd chunk (see the 'alloc_domheap_pages' calls above) is allocated
           from a separate MFN chunk. Hence we need to adjust for that. */


> +        signed long pvh_adj = is_pvh_domain(d) ?

Just 'long' and perhaps 'pvh_offset'? Or
'initrd_offset'?

> +                              (PFN_UP(initrd_len) - alloc_spfn)<<PAGE_SHIFT : 0;

Oh gosh, what a mix. So alloc_spfn is actually the MFN of the memory allocated
for the kernel. The PFN_UP(initrd_len) gives the length of the initrd in pfns.
And you are using 'GPFN's as 'pfn' here. And the code 'if (count < initrd_pfn)'
is short-circuited - so we will only put in 'pfn 0->d_tot_pages'.

So a 50MB initrd is 12800 pfns. Wait. Why are you subtracting that value from
the MFN of the memory that will be used for the kernel (say it is at 0x100 MFN
for fun)?

I am not sure I understand that. Could you explain it to me please?

> +
>          if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) )
>          {
>              maddr_to_page(mpt_alloc)->u.inuse.type_info = PGT_l1_page_table;
> @@ -657,16 +742,17 @@ int __init construct_dom0(
>                      clear_page(l3tab);
>                      if ( count == 0 )
>                          l3tab += l3_table_offset(v_start);
> -                    *l4tab = l4e_from_paddr(__pa(l3start), L4_PROT);
> +                    *l4tab = l4e_from_paddr(__pa(l3start) + pvh_adj, L4_PROT);
>                      l4tab++;
>                  }
> -                *l3tab = l3e_from_paddr(__pa(l2start), L3_PROT);
> +                *l3tab = l3e_from_paddr(__pa(l2start) + pvh_adj, L3_PROT);
>                  l3tab++;
>              }
> -            *l2tab = l2e_from_paddr(__pa(l1start), L2_PROT);
> +            *l2tab = l2e_from_paddr(__pa(l1start) + pvh_adj, L2_PROT);
>              l2tab++;
>          }
> -        if ( count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
> +        if ( is_pvh_domain(d) ||
> +             count < initrd_pfn || count >= initrd_pfn + PFN_UP(initrd_len) )
>              mfn = pfn++;
>          else
>              mfn = initrd_mfn++;
> @@ -674,6 +760,9 @@ int __init construct_dom0(
>                                      L1_PROT : COMPAT_L1_PROT));
>          l1tab++;
>  
> +        if ( is_pvh_domain(d) )
> +            continue;
> +
>          page = mfn_to_page(mfn);
>          if ( (page->u.inuse.type_info == 0) &&
>               !get_page_and_type(page, d, PGT_writable_page) )
> @@ -702,6 +791,9 @@ int __init construct_dom0(
>                 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2tab));
>      }
>  
> +    if  ( is_pvh_domain(d) )
> +        goto pvh_skip_pt_rdonly;
> +
>      /* Pages that are part of page tables must be read only. */
>      l4tab = l4start + l4_table_offset(vpt_start);
>      l3start = l3tab = l4e_to_l3e(*l4tab);
> @@ -741,6 +833,8 @@ int __init construct_dom0(
>          }
>      }
>  
> +pvh_skip_pt_rdonly:
> +

Hm, so the pagetable creation for the initial page-tables virtual addresses
has been eliminated. That means the initial calculation of 'order'
(alloc_domheap_pages) and the big loop of:

484     for ( nr_pt_pages = 2; ; nr_pt_pages++ )
485     {

calculate the wrong amount, right?

This needs a bit of mulling over. Since most of the Xen PV MMU code is disabled,
that means (from a Linux kernel perspective) that we boot up with:

        init_level4_pgt[272]
                -> level3_ident_pgt[0]
                        level2_ident_pgt [0-511]
                                [copy from the L2 that we created above.]
        init_level4_pgt[511]
                -> level3_kernel_pgt [510]
                        -> level2_kernel_pgt [0-511]
                                [copy from the L2 that we created above.]
                

Since this is a wholesale copy that only created page-tables for
[0 -> end of 'shared info' structure] and did not create any page-table entries
for the L4, L3, L2, L1, L1, L1... page-tables themselves.

So any __va or __ka will work. And cleanup_highmap takes care of ripping out
any entries from _brk_limit up to max_pfn_mapped (end of ramdisk).

OK, so far so good. Wait, no, it is not good - max_pfn_mapped is up to:
xen_start_info->mfn_list. That is the ramdisk.

But that means that the __ka addresses for the shared_info, dom0_vga_console and
start_info are <p2m array + xen_start_info->mfn_list> in. And cleanup_highmap
would rip those out. Ah, we do have a bunch of revectoring:

        __va(xen_start_info->shared_info);

and
        xen_start_info = (struct start_info *)__va(__pa(xen_start_info));

in the initial boot paths. So that is OK.


OK. The xen_pagetable_init uses level2_kernel_pgt (so in Linux __ka space)
to walk and clear up the PMD entries.

And since we do not use the __ka to update the pagetables, but switch
over to using __va as fast as possible, this does look to work.


But this needs to be documented. And we probably should not allocate
that extra space for L1 entries that we are never going to use.


>      /* Mask all upcalls... */
>      for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
>          shared_info(d, vcpu_info[i].evtchn_upcall_mask) = 1;
> @@ -754,6 +848,11 @@ int __init construct_dom0(
>          (void)alloc_vcpu(d, i, cpu);
>      }
>  
> +    if ( is_pvh_domain(d) )
> +    {
> +        v->arch.cr3 = v->arch.hvm_vcpu.guest_cr[3] =
> +                        (pagetable_get_pfn(v->arch.guest_table)) << PAGE_SHIFT;
> +    }
>      /* Set up CR3 value for write_ptbase */
>      if ( paging_mode_enabled(d) )
>          paging_update_paging_modes(v);
> @@ -764,35 +863,16 @@ int __init construct_dom0(
>      write_ptbase(v);
>      mapcache_override_current(v);
>  
> -    /* Copy the OS image and free temporary buffer. */
> -    elf.dest = (void*)vkern_start;
> -    rc = elf_load_binary(&elf, 0);
> -    if ( rc < 0 )
> -    {
> -        printk("Failed to load the kernel binary\n");
> -        return rc;
> -    }
> -    bootstrap_map(NULL);
> -
> -    if ( UNSET_ADDR != parms.virt_hypercall )
> -    {
> -        if ( (parms.virt_hypercall < v_start) ||
> -             (parms.virt_hypercall >= v_end) )
> -        {
> -            mapcache_override_current(NULL);
> -            write_ptbase(current);
> -            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
> -            return -1;

Wait, you just ripped out this code? Why?!

> +    /* Set up start info area. */
> +    if ( is_pvh_domain(d) ) {
> +        if ( (si_buf=xmalloc_bytes(PAGE_SIZE)) == NULL) {

Odd style.
> +            printk("PVH: xmalloc failed to alloc %ld bytes.\n", PAGE_SIZE);
> +            return -ENOMEM;
>          }
> -        hypercall_page_initialise(
> -            d, (void *)(unsigned long)parms.virt_hypercall);
> -    }
> -
> -    /* Free temporary buffers. */
> -    discard_initial_images();
> +        si = (start_info_t *)si_buf;
> +    } else
> +        si = (start_info_t *)vstartinfo_start;

Why do we allocate a page for this? Isn't the allocation for how
many pages are needed for this space already taken care of in:

 479     vstartinfo_start = round_pgup(vphysmap_end);
 480     vstartinfo_end   = (vstartinfo_start +
 481                         sizeof(struct start_info) +
 482                         sizeof(struct dom0_vga_console_info));
 483     vpt_start        = round_pgup(vstartinfo_end);

and then

 517     page = alloc_domheap_pages(d, order, 0);

allocates the appropriate amount?


>  
> -    /* Set up start info area. */
> -    si = (start_info_t *)vstartinfo_start;
>      clear_page(si);
>      si->nr_pages = nr_pages;
>  
> @@ -814,7 +894,7 @@ int __init construct_dom0(
>      l2tab = NULL;
>      l1tab = NULL;
>      /* Set up the phys->machine table if not part of the initial mapping. */
> -    if ( parms.p2m_base != UNSET_ADDR )
> +    if ( parms.p2m_base != UNSET_ADDR && !is_pvh_domain(d) )

AHA! I knew it! You could also do something like this at the start of the code:

        parms.p2m_base = 0xdeadbeef;

Or something better? 0xdeadf00d?

And then in here check for 0xdeadbeef and skip over setting
up the page-table entries for the P2M?
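Or, to let the existing 'parms.p2m_base != UNSET_ADDR' check just do the right
thing, perhaps (a sketch, assuming nothing else consumes p2m_base for PVH):

    if ( is_pvh_domain(d) )
        parms.p2m_base = UNSET_ADDR;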
>      {
>          unsigned long va = vphysmap_start;
>  
> @@ -935,6 +1015,9 @@ int __init construct_dom0(
>          unmap_domain_page(l3tab);
>      unmap_domain_page(l4start);
>  
> +    if (is_pvh_domain(d) )
> +        hap_set_pvh_alloc_for_dom0(d, nr_pages);
> +
>      /* Write the phys->machine and machine->phys table entries. */
>      for ( pfn = 0; pfn < count; pfn++ )
>      {
> @@ -951,11 +1034,8 @@ int __init construct_dom0(
>          if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) )
>              mfn = alloc_epfn - (pfn - REVERSE_START);
>  #endif
> -        if ( !is_pv_32on64_domain(d) )
> -            ((unsigned long *)vphysmap_start)[pfn] = mfn;
> -        else
> -            ((unsigned int *)vphysmap_start)[pfn] = mfn;
> -        set_gpfn_from_mfn(mfn, pfn);
> +        dom0_update_physmap(d, pfn, mfn, vphysmap_start);
> +
>          if (!(pfn & 0xfffff))
>              process_pending_softirqs();
>      }
> @@ -971,8 +1051,8 @@ int __init construct_dom0(
>              if ( !page->u.inuse.type_info &&
>                   !get_page_and_type(page, d, PGT_writable_page) )
>                  BUG();
> -            ((unsigned long *)vphysmap_start)[pfn] = mfn;
> -            set_gpfn_from_mfn(mfn, pfn);
> +            
> +            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
>              ++pfn;
>              if (!(pfn & 0xfffff))
>                  process_pending_softirqs();
> @@ -992,11 +1072,7 @@ int __init construct_dom0(
>  #ifndef NDEBUG
>  #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
>  #endif
> -            if ( !is_pv_32on64_domain(d) )
> -                ((unsigned long *)vphysmap_start)[pfn] = mfn;
> -            else
> -                ((unsigned int *)vphysmap_start)[pfn] = mfn;
> -            set_gpfn_from_mfn(mfn, pfn);
> +            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
>  #undef pfn
>              page++; pfn++;
>              if (!(pfn & 0xfffff))
> @@ -1004,6 +1080,47 @@ int __init construct_dom0(
>          }
>      }
>  
> +    /* Copy the OS image and free temporary buffer. */
> +    elf.dest = (void*)vkern_start;
> +    rc = elf_load_binary(&elf, is_pvh_domain(d) );
> +    if ( rc < 0 )
> +    {
> +        printk("Failed to load the kernel binary\n");
> +        return rc;
> +    }
> +    bootstrap_map(NULL);
> +
> +    if ( UNSET_ADDR != parms.virt_hypercall )
> +    {
> +        void *addr;
> +
> +        if ( is_pvh_domain(d) ) {
> +            if ( (tmp_buf=xzalloc_bytes(PAGE_SIZE)) == NULL ) {
> +                printk("xzalloc failed for tmp_buf. %ld bytes.\n", PAGE_SIZE);
> +                return -ENOMEM;
> +            }
> +            addr = tmp_buf;
> +        } else 
> +            addr = (void *)parms.virt_hypercall;
> +
> +        if ( (parms.virt_hypercall < v_start) ||
> +             (parms.virt_hypercall >= v_end) )
> +        {
> +            write_ptbase(current);
> +            printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
> +            return -1;
> +        }
> +        hypercall_page_initialise(d, addr);
> +
> +        if ( is_pvh_domain(d) ) {
> +            copy_pvh((void *)parms.virt_hypercall, tmp_buf, PAGE_SIZE);
> +            xfree(tmp_buf);
> +        }
> +    }

Wait, why was this code moved down here?

> +
> +    /* Free temporary buffers. */
> +    discard_initial_images();
> +
>      if ( initrd_len != 0 )
>      {
>          si->mod_start = vinitrd_start ?: initrd_pfn;
> @@ -1019,6 +1136,15 @@ int __init construct_dom0(
>          si->console.dom0.info_off  = sizeof(struct start_info);
>          si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
>      }
> +    if ( is_pvh_domain(d) ) {
> +        unsigned long mfn = virt_to_mfn(d->shared_info);
> +        unsigned long pfn = shared_info_pfn_addr>>PAGE_SHIFT;
> +        si->shared_info = shared_info_pfn_addr;

So that is not a PFN. That is the guest physical address, I reckon?

Ah, which matches (kind of) the comment:

725     unsigned long shared_info;  /* MACHINE address of shared info struct. */
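Perhaps that comment wants updating too, e.g.:

 725     unsigned long shared_info;  /* MACHINE address of shared info struct
                                        (GUEST PHYSICAL address for
                                         auto-translated guests). */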


> +        dom0_update_physmap(d, pfn, mfn, 0);

, 0 /* ignored. */

> +
> +        copy_pvh((char *)vstartinfo_start, si_buf, PAGE_SIZE);
> +        xfree(si_buf);

Oh. Why the copy? Why not just operate directly on the vstartinfo_start virtual
address?


> +    }
>  
>      if ( is_pv_32on64_domain(d) )
>          xlat_start_info(si, XLAT_start_info_console_dom0);
> @@ -1050,12 +1176,16 @@ int __init construct_dom0(
>      regs->eip = parms.virt_entry;
>      regs->esp = vstack_end;
>      regs->esi = vstartinfo_start;
> -    regs->eflags = X86_EFLAGS_IF;
> +    regs->eflags = X86_EFLAGS_IF | 0x2;

Ahem! Please use a define for the always-set reserved bit instead of a bare 0x2.
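I.e. (assuming X86_EFLAGS_MBS is the define you want):

    regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_MBS;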

>  
> -    if ( opt_dom0_shadow )
> +    if ( opt_dom0_shadow ) {
> +        if ( is_pvh_domain(d) ) {
> +            printk("Invalid option dom0_shadow for PVH\n");
> +            return -EINVAL;
> +        }
>          if ( paging_enable(d, PG_SH_enable) == 0 ) 
>              paging_update_paging_modes(v);
> -
> +    }
>      if ( supervisor_mode_kernel )
>      {
>          v->arch.pv_vcpu.kernel_ss &= ~3;
> @@ -1132,6 +1262,9 @@ int __init construct_dom0(
>  
>      BUG_ON(rc != 0);
>  
> +    if ( is_pvh_domain(d) )
> +        pvh_map_all_iomem(d);
> +
>      iommu_dom0_init(dom0);
>  
>      return 0;
> diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
> index 055833d..d3d5697 100644
> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -574,6 +574,20 @@ int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
>      }
>  }
>  
> +/* Resize hap table. Copied from: libxl_get_required_shadow_memory() */
> +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages)
> +{
> +    int rc;
> +    unsigned long memkb = num_pages * (PAGE_SIZE / 1024);
> +
> +    memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));

Could you include an explanation behind this formula please?
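FWIW, my reading of it, which would make a nice comment here (the numbers are
just an example):

    /*
     * libxl_get_required_shadow_memory() returns kB:
     *     4 * (256 * max_vcpus + 2 * (memkb / 1024))
     * i.e. 256 pages (1MB) per vcpu plus 2 pages (8kB) per MB of guest RAM.
     * E.g. 4 vcpus and 4GB of RAM: 4 * (256 * 4 + 2 * 4096) = 36864 kB = 36MB.
     */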

> +    num_pages = ((memkb+1023)/1024) << (20 - PAGE_SHIFT);

Couldn't that just be 'num_pages = memkb >> 2;', modulo the round-up to a
full MB that the current code does?

> +    paging_lock(d);
> +    rc = hap_set_allocation(d, num_pages, NULL);
> +    paging_unlock(d);
> +    BUG_ON(rc);
> +}
> +
>  static const struct paging_mode hap_paging_real_mode;
>  static const struct paging_mode hap_paging_protected_mode;
>  static const struct paging_mode hap_paging_pae_mode;
> @@ -633,7 +647,8 @@ static void hap_update_cr3(struct vcpu *v, int do_locking)
>  const struct paging_mode *
>  hap_paging_get_mode(struct vcpu *v)
>  {
> -    return !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
> +    return is_pvh_vcpu(v) ? &hap_paging_long_mode :
> +        !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
>          hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
>          hvm_pae_enabled(v)       ? &hap_paging_pae_mode  :
>                                     &hap_paging_protected_mode;
> diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
> index 43301a5..f307f24 100644
> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -60,6 +60,10 @@ integer_param("maxcpus", max_cpus);
>  static bool_t __initdata disable_smep;
>  invbool_param("smep", disable_smep);
>  
> +/* Boot dom0 in PVH mode */
> +static bool_t __initdata opt_dom0pvh;
> +boolean_param("dom0pvh", opt_dom0pvh);
> +
>  /* **** Linux config option: propagated to domain0. */
>  /* "acpi=off":    Disables both ACPI table parsing and interpreter. */
>  /* "acpi=force":  Override the disable blacklist.                   */
> @@ -545,7 +549,7 @@ void __init __start_xen(unsigned long mbi_p)
>  {
>      char *memmap_type = NULL;
>      char *cmdline, *kextra, *loader;
> -    unsigned int initrdidx;
> +    unsigned int initrdidx, domcr_flags = 0;
>      multiboot_info_t *mbi = __va(mbi_p);
>      module_t *mod = (module_t *)__va(mbi->mods_addr);
>      unsigned long nr_pages, modules_headroom, *module_map;
> @@ -1314,7 +1318,9 @@ void __init __start_xen(unsigned long mbi_p)
>          panic("Could not protect TXT memory regions\n");
>  
>      /* Create initial domain 0. */
> -    dom0 = domain_create(0, DOMCRF_s3_integrity, 0);
> +    domcr_flags = (opt_dom0pvh ? DOMCRF_pvh | DOMCRF_hap : 0);
> +    domcr_flags |= DOMCRF_s3_integrity;
> +    dom0 = domain_create(0, domcr_flags, 0);
>      if ( IS_ERR(dom0) || (alloc_dom0_vcpu0() == NULL) )
>          panic("Error creating domain 0\n");
>  
> diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
> index e03f983..aab8558 100644
> --- a/xen/include/asm-x86/hap.h
> +++ b/xen/include/asm-x86/hap.h
> @@ -63,6 +63,7 @@ int   hap_track_dirty_vram(struct domain *d,
>                             XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
>  
>  extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
> +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages);
>  
>  #endif /* XEN_HAP_H */
>  
> -- 
> 1.7.2.3
> 
> 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

