
Re: [Xen-devel] [V4 PATCH 2/7] dom0: construct_dom0 changes



>>> On 03.12.13 at 03:30, Mukesh Rathor <mukesh.rathor@xxxxxxxxxx> wrote:
> This patch changes construct_dom0 to boot in PVH mode. Changes
> needed to support it are also included here.
> 
> Signed-off-by: Mukesh Rathor <mukesh.rathor@xxxxxxxxxx>

Reviewed-by: Jan Beulich <jbeulich@xxxxxxxx>

> ---
>  xen/arch/x86/domain_build.c |  235 +++++++++++++++++++++++++++++++++++++++---
>  xen/arch/x86/mm/hap/hap.c   |   15 +++
>  xen/include/asm-x86/hap.h   |    1 +
>  3 files changed, 234 insertions(+), 17 deletions(-)
> 
> diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
> index 67a569a..eb00c0d 100644
> --- a/xen/arch/x86/domain_build.c
> +++ b/xen/arch/x86/domain_build.c
> @@ -35,6 +35,7 @@
>  #include <asm/setup.h>
>  #include <asm/bzimage.h> /* for bzimage_parse */
>  #include <asm/io_apic.h>
> +#include <asm/hap.h>
>  
>  #include <public/version.h>
>  
> @@ -307,6 +308,151 @@ static void __init process_dom0_ioports_disable(void)
>      }
>  }
>  
> +static __init void pvh_add_mem_mapping(struct domain *d, unsigned long gfn,
> +                                       unsigned long mfn, unsigned long nr_mfns)
> +{
> +    unsigned long i;
> +    for ( i = 0; i < nr_mfns; i++ )
> +        if ( !set_mmio_p2m_entry(d, gfn + i, _mfn(mfn + i)) )
> +            panic("Failed setting p2m. gfn:%lx mfn:%lx i:%ld\n", gfn, mfn, i);
> +}
> +
> +/*
> + * Set the 1:1 map for all non-RAM regions for dom0. Thus, dom0 will have
> + * the entire I/O region mapped in the EPT/NPT.
> + *
> + * pvh fixme: The following doesn't map MMIO ranges when they sit above the
> + *            highest E820 covered address.
> + */
> +static __init void pvh_map_all_iomem(struct domain *d)
> +{
> +    unsigned long start_pfn, end_pfn, end = 0, start = 0;
> +    const struct e820entry *entry;
> +    unsigned int i, nump;
> +
> +    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
> +    {
> +        end = entry->addr + entry->size;
> +
> +        if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE ||
> +             i == e820.nr_map - 1 )
> +        {
> +            start_pfn = PFN_DOWN(start);
> +
> +            /* Unused RAM areas are marked UNUSABLE, so skip them too */
> +            if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE )
> +                end_pfn = PFN_UP(entry->addr);
> +            else
> +                end_pfn = PFN_UP(end);
> +
> +            if ( start_pfn < end_pfn )
> +            {
> +                nump = end_pfn - start_pfn;
> +                /* Add pages to the mapping */
> +                pvh_add_mem_mapping(d, start_pfn, start_pfn, nump);
> +            }
> +            start = end;
> +        }
> +    }
> +
> +    /* If the e820 ended under 4GB, we must map the remaining space up to 4GB */
> +    if ( end < GB(4) )
> +    {
> +        start_pfn = PFN_UP(end);
> +        end_pfn = (GB(4)) >> PAGE_SHIFT;
> +        nump = end_pfn - start_pfn;
> +        pvh_add_mem_mapping(d, start_pfn, start_pfn, nump);
> +    }
> +}
> +
> +static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
> +                                   unsigned long mfn, unsigned long vphysmap_s)
> +{
> +    if ( is_pvh_domain(d) )
> +    {
> +        int rc = guest_physmap_add_page(d, pfn, mfn, 0);
> +        BUG_ON(rc);
> +        return;
> +    }
> +    if ( !is_pv_32on64_domain(d) )
> +        ((unsigned long *)vphysmap_s)[pfn] = mfn;
> +    else
> +        ((unsigned int *)vphysmap_s)[pfn] = mfn;
> +
> +    set_gpfn_from_mfn(mfn, pfn);
> +}
> +
> +static __init void pvh_fixup_page_tables_for_hap(struct vcpu *v,
> +                                                 unsigned long v_start,
> +                                                 unsigned long v_end)
> +{
> +    int i, j, k;
> +    l4_pgentry_t *pl4e, *l4start;
> +    l3_pgentry_t *pl3e;
> +    l2_pgentry_t *pl2e;
> +    l1_pgentry_t *pl1e;
> +    unsigned long cr3_pfn;
> +
> +    ASSERT(paging_mode_enabled(v->domain));
> +
> +    l4start = map_domain_page(pagetable_get_pfn(v->arch.guest_table));
> +
> +    /* Clear entries prior to guest L4 start */
> +    pl4e = l4start + l4_table_offset(v_start);
> +    memset(l4start, 0, (unsigned long)pl4e - (unsigned long)l4start);
> +
> +    for ( ; pl4e <= l4start + l4_table_offset(v_end - 1); pl4e++ )
> +    {
> +        pl3e = map_l3t_from_l4e(*pl4e);
> +        for ( i = 0; i < PAGE_SIZE / sizeof(*pl3e); i++, pl3e++ )
> +        {
> +            if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
> +                continue;
> +
> +            pl2e = map_l2t_from_l3e(*pl3e);
> +            for ( j = 0; j < PAGE_SIZE / sizeof(*pl2e); j++, pl2e++ )
> +            {
> +                if ( !(l2e_get_flags(*pl2e)  & _PAGE_PRESENT) )
> +                    continue;
> +
> +                pl1e = map_l1t_from_l2e(*pl2e);
> +                for ( k = 0; k < PAGE_SIZE / sizeof(*pl1e); k++, pl1e++ )
> +                {
> +                    if ( !(l1e_get_flags(*pl1e) & _PAGE_PRESENT) )
> +                        continue;
> +
> +                    *pl1e = l1e_from_pfn(get_gpfn_from_mfn(l1e_get_pfn(*pl1e)),
> +                                         l1e_get_flags(*pl1e));
> +                }
> +                unmap_domain_page(pl1e);
> +                *pl2e = l2e_from_pfn(get_gpfn_from_mfn(l2e_get_pfn(*pl2e)),
> +                                     l2e_get_flags(*pl2e));
> +            }
> +            unmap_domain_page(pl2e);
> +            *pl3e = l3e_from_pfn(get_gpfn_from_mfn(l3e_get_pfn(*pl3e)),
> +                                 l3e_get_flags(*pl3e));
> +        }
> +        unmap_domain_page(pl3e);
> +        *pl4e = l4e_from_pfn(get_gpfn_from_mfn(l4e_get_pfn(*pl4e)),
> +                             l4e_get_flags(*pl4e));
> +    }
> +
> +    /* Clear entries post guest L4. */
> +    if ( (unsigned long)pl4e & (PAGE_SIZE - 1) )
> +        memset(pl4e, 0, PAGE_SIZE - ((unsigned long)pl4e & (PAGE_SIZE - 1)));
> +
> +    unmap_domain_page(l4start);
> +
> +    cr3_pfn = get_gpfn_from_mfn(paddr_to_pfn(v->arch.cr3));
> +    v->arch.hvm_vcpu.guest_cr[3] = pfn_to_paddr(cr3_pfn);
> +
> +    /*
> +     * Finally, we update the paging modes (hap_update_paging_modes). This will
> +     * create monitor_table for us, update v->arch.cr3, and update vmcs.cr3.
> +     */
> +    paging_update_paging_modes(v);
> +}
> +
>  static __init void mark_pv_pt_pages_rdonly(struct domain *d,
>                                             l4_pgentry_t *l4start,
>                                             unsigned long vpt_start,
> @@ -516,6 +662,8 @@ int __init construct_dom0(
>      l3_pgentry_t *l3tab = NULL, *l3start = NULL;
>      l2_pgentry_t *l2tab = NULL, *l2start = NULL;
>      l1_pgentry_t *l1tab = NULL, *l1start = NULL;
> +    paddr_t shared_info_paddr = 0;
> +    u32 save_pvh_pg_mode = 0;
>  
>      /*
>       * This fully describes the memory layout of the initial domain. All 
> @@ -593,12 +741,21 @@ int __init construct_dom0(
>          goto out;
>      }
>  
> -    if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE &&
> -         !test_bit(XENFEAT_dom0, parms.f_supported) )
> +    if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE )
>      {
> -        printk("Kernel does not support Dom0 operation\n");
> -        rc = -EINVAL;
> -        goto out;
> +        if ( !test_bit(XENFEAT_dom0, parms.f_supported) )
> +        {
> +            printk("Kernel does not support Dom0 operation\n");
> +            rc = -EINVAL;
> +            goto out;
> +        }
> +        if ( is_pvh_domain(d) &&
> +             !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) )
> +        {
> +            printk("Kernel does not support PVH mode\n");
> +            rc = -EINVAL;
> +            goto out;
> +        }
>      }
>  
>      if ( compat32 )
> @@ -663,6 +820,13 @@ int __init construct_dom0(
>      vstartinfo_end   = (vstartinfo_start +
>                          sizeof(struct start_info) +
>                          sizeof(struct dom0_vga_console_info));
> +
> +    if ( is_pvh_domain(d) )
> +    {
> +        shared_info_paddr = round_pgup(vstartinfo_end) - v_start;
> +        vstartinfo_end   += PAGE_SIZE;
> +    }
> +
>      vpt_start        = round_pgup(vstartinfo_end);
>      for ( nr_pt_pages = 2; ; nr_pt_pages++ )
>      {
> @@ -903,6 +1067,13 @@ int __init construct_dom0(
>          (void)alloc_vcpu(d, i, cpu);
>      }
>  
> +    /*
> +     * pvh: we temporarily disable paging mode so that we can build cr3 needed
> +     * to run on dom0's page tables.
> +     */
> +    save_pvh_pg_mode = d->arch.paging.mode;
> +    d->arch.paging.mode = 0;
> +
>      /* Set up CR3 value for write_ptbase */
>      if ( paging_mode_enabled(d) )
>          paging_update_paging_modes(v);
> @@ -969,6 +1140,15 @@ int __init construct_dom0(
>                           nr_pages);
>      }
>  
> +    if ( is_pvh_domain(d) )
> +        hap_set_pvh_alloc_for_dom0(d, nr_pages);
> +
> +    /*
> +     * We enable paging mode again so guest_physmap_add_page will do the
> +     * right thing for us.
> +     */
> +    d->arch.paging.mode = save_pvh_pg_mode;
> +
>      /* Write the phys->machine and machine->phys table entries. */
>      for ( pfn = 0; pfn < count; pfn++ )
>      {
> @@ -985,11 +1165,7 @@ int __init construct_dom0(
>          if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) )
>              mfn = alloc_epfn - (pfn - REVERSE_START);
>  #endif
> -        if ( !is_pv_32on64_domain(d) )
> -            ((unsigned long *)vphysmap_start)[pfn] = mfn;
> -        else
> -            ((unsigned int *)vphysmap_start)[pfn] = mfn;
> -        set_gpfn_from_mfn(mfn, pfn);
> +        dom0_update_physmap(d, pfn, mfn, vphysmap_start);
>          if (!(pfn & 0xfffff))
>              process_pending_softirqs();
>      }
> @@ -1005,8 +1181,8 @@ int __init construct_dom0(
>              if ( !page->u.inuse.type_info &&
>                   !get_page_and_type(page, d, PGT_writable_page) )
>                  BUG();
> -            ((unsigned long *)vphysmap_start)[pfn] = mfn;
> -            set_gpfn_from_mfn(mfn, pfn);
> +
> +            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
>              ++pfn;
>              if (!(pfn & 0xfffff))
>                  process_pending_softirqs();
> @@ -1026,11 +1202,7 @@ int __init construct_dom0(
>  #ifndef NDEBUG
>  #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
>  #endif
> -            if ( !is_pv_32on64_domain(d) )
> -                ((unsigned long *)vphysmap_start)[pfn] = mfn;
> -            else
> -                ((unsigned int *)vphysmap_start)[pfn] = mfn;
> -            set_gpfn_from_mfn(mfn, pfn);
> +            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
>  #undef pfn
>              page++; pfn++;
>              if (!(pfn & 0xfffff))
> @@ -1054,6 +1226,15 @@ int __init construct_dom0(
>          si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
>      }
>  
> +    /*
> +     * PVH: We need to update si->shared_info while we are on dom0 page tables,
> +     * but need to defer the p2m update until after we have fixed up the
> +     * page tables for PVH so that the m2p for the si pte entry returns the
> +     * correct pfn.
> +     */
> +    if ( is_pvh_domain(d) )
> +        si->shared_info = shared_info_paddr;
> +
>      if ( is_pv_32on64_domain(d) )
>          xlat_start_info(si, XLAT_start_info_console_dom0);
>  
> @@ -1087,8 +1268,15 @@ int __init construct_dom0(
>      regs->eflags = X86_EFLAGS_IF;
>  
>      if ( opt_dom0_shadow )
> +    {
> +        if ( is_pvh_domain(d) )
> +        {
> +            printk("Unsupported option dom0_shadow for PVH\n");
> +            return -EINVAL;
> +        }
>          if ( paging_enable(d, PG_SH_enable) == 0 ) 
>              paging_update_paging_modes(v);
> +    }
>  
>      if ( supervisor_mode_kernel )
>      {
> @@ -1178,6 +1366,19 @@ int __init construct_dom0(
>          printk(" Xen warning: dom0 kernel broken ELF: %s\n",
>                 elf_check_broken(&elf));
>  
> +    if ( is_pvh_domain(d) )
> +    {
> +        /* finally, fixup the page table, replacing mfns with pfns */
> +        pvh_fixup_page_tables_for_hap(v, v_start, v_end);
> +
> +        /* the pt has correct pfn for si, now update the mfn in the p2m */
> +        mfn = virt_to_mfn(d->shared_info);
> +        pfn = shared_info_paddr >> PAGE_SHIFT;
> +        dom0_update_physmap(d, pfn, mfn, 0);
> +
> +        pvh_map_all_iomem(d);
> +    }
> +
>      iommu_dom0_init(dom0);
>      return 0;
>  
> diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
> index d3f64bd..cc3ba66 100644
> --- a/xen/arch/x86/mm/hap/hap.c
> +++ b/xen/arch/x86/mm/hap/hap.c
> @@ -579,6 +579,21 @@ int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
>      }
>  }
>  
> +void __init hap_set_pvh_alloc_for_dom0(struct domain *d,
> +                                       unsigned long num_pages)
> +{
> +    int rc;
> +    unsigned long memkb = num_pages * (PAGE_SIZE / 1024);
> +
> +    /* Copied from: libxl_get_required_shadow_memory() */
> +    memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));
> +    num_pages = ( (memkb + 1023) / 1024) << (20 - PAGE_SHIFT);
> +    paging_lock(d);
> +    rc = hap_set_allocation(d, num_pages, NULL);
> +    paging_unlock(d);
> +    BUG_ON(rc);
> +}
> +
>  static const struct paging_mode hap_paging_real_mode;
>  static const struct paging_mode hap_paging_protected_mode;
>  static const struct paging_mode hap_paging_pae_mode;
> diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
> index e03f983..aab8558 100644
> --- a/xen/include/asm-x86/hap.h
> +++ b/xen/include/asm-x86/hap.h
> @@ -63,6 +63,7 @@ int   hap_track_dirty_vram(struct domain *d,
>                             XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
>  
>  extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
> +void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages);
>  
>  #endif /* XEN_HAP_H */
>  
> -- 
> 1.7.2.3



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

