
[Xen-devel] [RFC 0 PATCH 3/3] PVH dom0: construct_dom0 changes



This patch changes construct_dom0 to boot dom0 in PVH mode. The changes
needed to support this are also included here.
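
In PVH mode dom0 runs on hardware-assisted paging, so construct_dom0 must:
check that the kernel advertises XENFEAT_hvm_callback_vector, size the HAP
(p2m) allocation, map all non-RAM (MMIO) regions 1:1 into the p2m, reserve
a page for shared_info, and rewrite dom0's boot page tables to hold pfns
instead of mfns.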

Signed-off-by: Mukesh Rathor <mukesh.rathor@xxxxxxxxxx>
---
 xen/arch/x86/domain_build.c |  231 +++++++++++++++++++++++++++++++++++++++----
 xen/arch/x86/domctl.c       |    2 +-
 xen/arch/x86/mm/hap/hap.c   |   14 +++
 xen/include/asm-x86/hap.h   |    1 +
 xen/include/xen/domain.h    |    3 +
 5 files changed, 231 insertions(+), 20 deletions(-)

diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index 5125aa2..3fd2b6c 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -35,6 +35,7 @@
 #include <asm/setup.h>
 #include <asm/bzimage.h> /* for bzimage_parse */
 #include <asm/io_apic.h>
+#include <asm/hap.h>
 
 #include <public/version.h>
 
@@ -307,6 +308,136 @@ static void __init process_dom0_ioports_disable(void)
     }
 }
 
+/*
+ * Set up a 1:1 mapping for all non-RAM regions of dom0, so that dom0 has
+ * the entire I/O space mapped in the EPT/NPT.
+ *
+ * PVH FIXME: The following does not map MMIO ranges that sit above the
+ *            highest address covered by the e820.
+ */
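+/*
+ * For illustration, with a hypothetical e820 of
+ *   [0x000000, 0x09f000) E820_RAM
+ *   [0x09f000, 0x100000) E820_RESERVED
+ *   [0x100000, 0x200000) E820_RAM
+ * the loop below maps the hole [0x09f000, 0x100000) 1:1 into the p2m, and
+ * the trailing check maps [0x200000, 4GB) so that legacy MMIO below 4GB is
+ * covered.
+ */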
+static __init void pvh_map_all_iomem(struct domain *d)
+{
+    unsigned long start_pfn, end_pfn, end = 0, start = 0;
+    const struct e820entry *entry;
+    unsigned int i, nump;
+    int rc;
+
+    for ( i = 0, entry = e820.map; i < e820.nr_map; i++, entry++ )
+    {
+        end = entry->addr + entry->size;
+
+        if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE ||
+             i == e820.nr_map - 1 )
+        {
+            start_pfn = PFN_DOWN(start);
+            end_pfn = PFN_UP(end);
+
+            if ( entry->type == E820_RAM || entry->type == E820_UNUSABLE )
+                end_pfn = PFN_UP(entry->addr);
+
+            if ( start_pfn < end_pfn )
+            {
+                nump = end_pfn - start_pfn;
+                /* Add pages to the mapping */
+                rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);
+                BUG_ON(rc);
+            }
+            start = end;
+        }
+    }
+
+    /* If the e820 ends below 4GB, map the remaining space up to 4GB. */
+    if ( end < GB(4) )
+    {
+        start_pfn = PFN_UP(end);
+        end_pfn = GB(4) >> PAGE_SHIFT;
+        nump = end_pfn - start_pfn;
+        rc = domctl_memory_mapping(d, start_pfn, start_pfn, nump, 1);
+        BUG_ON(rc);
+    }
+}
+
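+/*
+ * Enter one pfn->mfn mapping for dom0. For PVH the p2m lives in the
+ * hypervisor, so the page is added via guest_physmap_add_page(); for PV
+ * the guest's vphysmap array and the global m2p are written directly.
+ */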
+static __init void dom0_update_physmap(struct domain *d, unsigned long pfn,
+                                   unsigned long mfn, unsigned long vphysmap_s)
+{
+    if ( is_pvh_domain(d) )
+    {
+        int rc = guest_physmap_add_page(d, pfn, mfn, 0);
+        BUG_ON(rc);
+        return;
+    }
+    if ( !is_pv_32on64_domain(d) )
+        ((unsigned long *)vphysmap_s)[pfn] = mfn;
+    else
+        ((unsigned int *)vphysmap_s)[pfn] = mfn;
+
+    set_gpfn_from_mfn(mfn, pfn);
+}
+
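+/*
+ * Dom0's boot page tables were built with machine frame numbers while
+ * paging was disabled. Under HAP the guest references pfns, so walk all
+ * four levels and rewrite every present entry from mfn to pfn via the
+ * m2p, then point the guest CR3 at the pfn-based table.
+ */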
+static __init void pvh_fixup_page_tables_for_hap(struct vcpu *v,
+                                                 unsigned long v_start)
+{
+    int i, j, k;
+    l4_pgentry_t *l4tab = NULL, *l4start = NULL;
+    l3_pgentry_t *l3tab = NULL;
+    l2_pgentry_t *l2tab = NULL;
+    l1_pgentry_t *l1tab = NULL;
+    l4_pgentry_t sav_guest_l4;
+    unsigned long cr3_pfn;
+
+    ASSERT(paging_mode_enabled(v->domain));
+
+    l4start = map_domain_page(pagetable_get_pfn(v->arch.guest_table));
+    l4tab = l4start + l4_table_offset(v_start);
+    sav_guest_l4 = *l4tab;
+
+    /* Give guest a clean slate to start with */
+    clear_page(l4start);
+    *l4tab = sav_guest_l4;
+    BUG_ON(!l4e_get_pfn(sav_guest_l4));
+
+    l3tab = map_l3t_from_l4e(*l4tab);
+    for ( i = 0; i < PAGE_SIZE / sizeof(l3_pgentry_t); i++, l3tab++ )
+    {
+        if ( l3e_get_pfn(*l3tab) == 0 )
+            continue;
+
+        l2tab = map_l2t_from_l3e(*l3tab);
+        for ( j = 0; j < PAGE_SIZE / sizeof(l2_pgentry_t); j++, l2tab++ )
+        {
+            if ( l2e_get_pfn(*l2tab) == 0 )
+                continue;
+
+            l1tab = map_l1t_from_l2e(*l2tab);
+
+            for ( k = 0; k < PAGE_SIZE / sizeof(l1_pgentry_t); k++, l1tab++ )
+            {
+                if ( l1e_get_pfn(*l1tab) == 0 )
+                    continue;
+
+                *l1tab = l1e_from_pfn(get_gpfn_from_mfn(l1e_get_pfn(*l1tab)),
+                                      l1e_get_flags(*l1tab));
+            }
+            *l2tab = l2e_from_pfn(get_gpfn_from_mfn(l2e_get_pfn(*l2tab)),
+                                  l2e_get_flags(*l2tab));
+        }
+        *l3tab = l3e_from_pfn(get_gpfn_from_mfn(l3e_get_pfn(*l3tab)),
+                              l3e_get_flags(*l3tab));
+    }
+    *l4tab = l4e_from_pfn(get_gpfn_from_mfn(l4e_get_pfn(*l4tab)),
+                          l4e_get_flags(*l4tab));
+
+    cr3_pfn = get_gpfn_from_mfn(paddr_to_pfn(v->arch.cr3));
+    v->arch.hvm_vcpu.guest_cr[3] = pfn_to_paddr(cr3_pfn);
+
+    /*
+     * Now update the paging modes (for HAP this ends up in
+     * hap_update_paging_modes()), which creates the monitor table for us
+     * and updates v->arch.cr3 and the CR3 field in the VMCS.
+     */
+    paging_update_paging_modes(v);
+}
+
 static __init void mark_pv_pt_pages_rdonly(struct domain *d,
                                            l4_pgentry_t *l4start,
                                            unsigned long vpt_start,
@@ -513,7 +644,7 @@ int __init construct_dom0(
     unsigned long alloc_spfn;
     unsigned long alloc_epfn;
     unsigned long initrd_pfn = -1, initrd_mfn = 0;
-    unsigned long count;
+    unsigned long count, shared_info_paddr = 0;
     struct page_info *page = NULL;
     start_info_t *si;
     struct vcpu *v = d->vcpu[0];
@@ -526,6 +657,7 @@ int __init construct_dom0(
     l3_pgentry_t *l3tab = NULL, *l3start = NULL;
     l2_pgentry_t *l2tab = NULL, *l2start = NULL;
     l1_pgentry_t *l1tab = NULL, *l1start = NULL;
+    u32 save_pvh_pg_mode = 0;
 
     /*
      * This fully describes the memory layout of the initial domain. All 
@@ -603,12 +735,20 @@ int __init construct_dom0(
         goto out;
     }
 
-    if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE &&
-         !test_bit(XENFEAT_dom0, parms.f_supported) )
+    if ( parms.elf_notes[XEN_ELFNOTE_SUPPORTED_FEATURES].type != XEN_ENT_NONE )
     {
-        printk("Kernel does not support Dom0 operation\n");
-        rc = -EINVAL;
-        goto out;
+        if ( !test_bit(XENFEAT_dom0, parms.f_supported) )
+        {
+            printk("Kernel does not support Dom0 operation\n");
+            rc = -EINVAL;
+            goto out;
+        }
+        if ( is_pvh_domain(d) &&
+             !test_bit(XENFEAT_hvm_callback_vector, parms.f_supported) )
+        {
+            printk("Kernel does not support PVH mode\n");
+            rc = -EINVAL;
+            goto out;
+        }
     }
 
     if ( compat32 )
@@ -673,6 +813,14 @@ int __init construct_dom0(
     vstartinfo_end   = (vstartinfo_start +
                         sizeof(struct start_info) +
                         sizeof(struct dom0_vga_console_info));
+
+    if ( is_pvh_domain(d) )
+    {
+        /* Note: this is a guest physical address (paddr), not a maddr. */
+        shared_info_paddr = round_pgup(vstartinfo_end) - v_start;
+        vstartinfo_end   += PAGE_SIZE;
+    }
+
     vpt_start        = round_pgup(vstartinfo_end);
     for ( nr_pt_pages = 2; ; nr_pt_pages++ )
     {
@@ -868,6 +1016,9 @@ int __init construct_dom0(
                                     L1_PROT : COMPAT_L1_PROT));
         l1tab++;
 
+        if ( is_pvh_domain(d) )
+            continue;
+
         page = mfn_to_page(mfn);
         if ( (page->u.inuse.type_info == 0) &&
              !get_page_and_type(page, d, PGT_writable_page) )
@@ -912,6 +1063,16 @@ int __init construct_dom0(
         (void)alloc_vcpu(d, i, cpu);
     }
 
+    /*
+     * PVH: temporarily disable the paging mode so that we can build the
+     * CR3 needed to run on dom0's page tables.
+     */
+    if ( is_pvh_domain(d) )
+    {
+        save_pvh_pg_mode = d->arch.paging.mode;
+        d->arch.paging.mode = 0;
+    }
+
     /* Set up CR3 value for write_ptbase */
     if ( paging_mode_enabled(d) )
         paging_update_paging_modes(v);
@@ -974,6 +1135,17 @@ int __init construct_dom0(
         setup_pv_p2m_table(d, v, &parms, v_start, vphysmap_start,
                            vphysmap_end, v_end, nr_pages);
 
+    if ( is_pvh_domain(d) )
+    {
+        hap_set_pvh_alloc_for_dom0(d, nr_pages);
+
+        /*
+         * Re-enable paging mode so that guest_physmap_add_page() does the
+         * right thing for us.
+         */
+        d->arch.paging.mode = save_pvh_pg_mode;
+    }
+
     /* Write the phys->machine and machine->phys table entries. */
     for ( pfn = 0; pfn < count; pfn++ )
     {
@@ -990,11 +1162,7 @@ int __init construct_dom0(
         if ( pfn > REVERSE_START && (vinitrd_start || pfn < initrd_pfn) )
             mfn = alloc_epfn - (pfn - REVERSE_START);
 #endif
-        if ( !is_pv_32on64_domain(d) )
-            ((unsigned long *)vphysmap_start)[pfn] = mfn;
-        else
-            ((unsigned int *)vphysmap_start)[pfn] = mfn;
-        set_gpfn_from_mfn(mfn, pfn);
+        dom0_update_physmap(d, pfn, mfn, vphysmap_start);
         if (!(pfn & 0xfffff))
             process_pending_softirqs();
     }
@@ -1010,8 +1178,8 @@ int __init construct_dom0(
             if ( !page->u.inuse.type_info &&
                  !get_page_and_type(page, d, PGT_writable_page) )
                 BUG();
-            ((unsigned long *)vphysmap_start)[pfn] = mfn;
-            set_gpfn_from_mfn(mfn, pfn);
+
+            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
             ++pfn;
             if (!(pfn & 0xfffff))
                 process_pending_softirqs();
@@ -1031,11 +1199,7 @@ int __init construct_dom0(
 #ifndef NDEBUG
 #define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn)))
 #endif
-            if ( !is_pv_32on64_domain(d) )
-                ((unsigned long *)vphysmap_start)[pfn] = mfn;
-            else
-                ((unsigned int *)vphysmap_start)[pfn] = mfn;
-            set_gpfn_from_mfn(mfn, pfn);
+            dom0_update_physmap(d, pfn, mfn, vphysmap_start);
 #undef pfn
             page++; pfn++;
             if (!(pfn & 0xfffff))
@@ -1059,6 +1223,15 @@ int __init construct_dom0(
         si->console.dom0.info_size = sizeof(struct dom0_vga_console_info);
     }
 
+    /*
+     * PVH: si->shared_info must be set while we are still on dom0's page
+     * tables, but the p2m update is deferred until after the page tables
+     * have been fixed up for PVH, so that the m2p lookup for the si pte
+     * entry returns the correct pfn.
+     */
+    if ( is_pvh_domain(d) )
+        si->shared_info = shared_info_paddr;
+
     if ( is_pv_32on64_domain(d) )
         xlat_start_info(si, XLAT_start_info_console_dom0);
 
@@ -1089,11 +1262,18 @@ int __init construct_dom0(
     regs->eip = parms.virt_entry;
     regs->esp = vstack_end;
     regs->esi = vstartinfo_start;
-    regs->eflags = X86_EFLAGS_IF;
+    regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_MBS;
 
     if ( opt_dom0_shadow )
+    {
+        if ( is_pvh_domain(d) )
+        {
+            printk("Invalid option dom0_shadow for PVH\n");
+            rc = -EINVAL;
+            goto out;
+        }
         if ( paging_enable(d, PG_SH_enable) == 0 ) 
             paging_update_paging_modes(v);
+    }
 
     if ( supervisor_mode_kernel )
     {
@@ -1183,6 +1363,19 @@ int __init construct_dom0(
         printk(" Xen warning: dom0 kernel broken ELF: %s\n",
                elf_check_broken(&elf));
 
+    if ( is_pvh_domain(d) )
+    {
+        /* Finally, fix up the page tables, replacing mfns with pfns. */
+        pvh_fixup_page_tables_for_hap(v, v_start);
+
+        /* The page tables now hold the correct pfn for si; update the mfn in the p2m. */
+        mfn = virt_to_mfn(d->shared_info);
+        pfn = shared_info_paddr >> PAGE_SHIFT;
+        dom0_update_physmap(d, pfn, mfn, 0);
+
+        pvh_map_all_iomem(d);
+    }
+
     iommu_dom0_init(dom0);
     return 0;
 
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index fe7ca00..998827c 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -46,7 +46,7 @@ static int gdbsx_guest_mem_io(
     return (iop->remain ? -EFAULT : 0);
 }
 
-static long domctl_memory_mapping(struct domain *d, unsigned long gfn,
+long domctl_memory_mapping(struct domain *d, unsigned long gfn,
                            unsigned long mfn, unsigned long nr_mfns,
                            bool_t add)
 {
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index bff05d9..2f5a48b 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -580,6 +580,20 @@ int hap_domctl(struct domain *d, xen_domctl_shadow_op_t *sc,
     }
 }
 
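+/*
+ * Size the HAP (p2m) pool for dom0. Mirroring the heuristic in
+ * libxl_get_required_shadow_memory(): 1MB per vcpu plus 8KB per MB of
+ * guest RAM, rounded up to whole MBs.
+ */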
+void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages)
+{
+    int rc;
+    unsigned long memkb = num_pages * (PAGE_SIZE / 1024);
+
+    /* Copied from libxl_get_required_shadow_memory(). */
+    memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));
+    num_pages = ((memkb + 1023) / 1024) << (20 - PAGE_SHIFT);
+    paging_lock(d);
+    rc = hap_set_allocation(d, num_pages, NULL);
+    paging_unlock(d);
+    BUG_ON(rc);
+}
+
 static const struct paging_mode hap_paging_real_mode;
 static const struct paging_mode hap_paging_protected_mode;
 static const struct paging_mode hap_paging_pae_mode;
diff --git a/xen/include/asm-x86/hap.h b/xen/include/asm-x86/hap.h
index e03f983..aab8558 100644
--- a/xen/include/asm-x86/hap.h
+++ b/xen/include/asm-x86/hap.h
@@ -63,6 +63,7 @@ int   hap_track_dirty_vram(struct domain *d,
                            XEN_GUEST_HANDLE_64(uint8) dirty_bitmap);
 
 extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
+void hap_set_pvh_alloc_for_dom0(struct domain *d, unsigned long num_pages);
 
 #endif /* XEN_HAP_H */
 
diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
index a057069..6436bab 100644
--- a/xen/include/xen/domain.h
+++ b/xen/include/xen/domain.h
@@ -89,4 +89,7 @@ extern unsigned int xen_processor_pmbits;
 
 extern bool_t opt_dom0_vcpus_pin;
 
+extern long domctl_memory_mapping(struct domain *d, unsigned long gfn,
+                    unsigned long mfn, unsigned long nr_mfns, bool_t add_map);
+
 #endif /* __XEN_DOMAIN_H__ */
-- 
1.7.2.3

