[Xen-devel] [PATCH 6 of 8 [RFC]] libxc: introduce xc_domain_move_memory



as a mechanism for deallocating and reallocating (immediately!) _all_
the memory of a domain. Note that it relies on the guest already being
suspended when the function is invoked.

Of course, it is quite likely that the memory ends up in different
places from where it was before the call but whether, for instance,
the new location is a different NUMA node (or anything else about it)
does not depend in any way on this function.

In fact, here the guest pages are just freed and immediately
re-allocated (you can see it as a very quick, back-to-back save-restore
cycle).

If the current domain configuration says, for instance, that new
allocations should go to a specific NUMA node, then the whole domain
is, as a matter of fact, moved there; but again, this is not
something this function does explicitly.
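
If one is actually interested in moving a domain to a specific NUMA
node, the intended sequence of calls is something like the following
(a hypothetical and heavily simplified toolstack snippet: all error
handling is left out, the suspend/resume plumbing is assumed to happen
elsewhere, and the nodemap handling shown is just illustrative):

    /* the domain is assumed to be suspended already */
    xc_nodemap_t nodemap = xc_nodemap_alloc(xch);

    /* ask for all future allocations to come from NUMA node 1 */
    nodemap[0] |= 1 << 1;
    xc_domain_node_setaffinity(xch, domid, nodemap);

    /* free and re-allocate everything: the memory ends up on node 1 */
    xc_domain_move_memory(xch, domid);

    free(nodemap);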

The way we do this is, very briefly, as follows (a condensed sketch
of the core batch loop is shown right after the list):
 1. drop all the references to all the pages of a domain,
 2. back up the content of a batch of pages,
 3. deallocate the batch,
 4. allocate a new set of pages for the batch,
 5. copy the backed up content into the new pages,
 6. if there are more pages, go back to 2, otherwise
 7. update the page tables, the vcpu contexts, the P2M, etc.
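
In (heavily condensed) pseudo-C, steps 2 to 5 look more or less like
this (all error handling omitted; see xc_domain_move_memory() in the
patch below for the real thing):

    /* 2. map the current MFNs of the batch and back their content up */
    old_p = xc_map_foreign_pages(xch, domid, PROT_READ, old_mfns, i);
    memcpy(backup, old_p, PAGE_SIZE * i);
    munmap(old_p, PAGE_SIZE * i);

    /* 3.-4. free the old pages and immediately allocate fresh ones */
    xc_domain_decrease_reservation(xch, domid, i, 0, old_mfns);
    xc_domain_populate_physmap_exact(xch, domid, i, 0, 0, new_mfns);

    /* 5. copy the backed up content into the newly allocated pages */
    new_p = xc_map_foreign_pages(xch, domid, PROT_WRITE, new_mfns, i);
    memcpy(new_p, backup, PAGE_SIZE * i);
    munmap(new_p, PAGE_SIZE * i);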

The above raises a number of quite complex issues and _not_ all
of them are dealt with or solved in this series (RFC means
something after all, doesn't it? ;-P).

XXX Open issues are:
     - HVM ("easy" to add, but it's not in this patch. See the
            cover letter for the series);
     - PAE guests, as they need special attention for some of
       the page tables (should be trivial to add);
     - grant tables/granted pages: how to move them?
     - TMEM: how to "move" it?
     - shared/paged pages: what to do with them?
     - guest pages mapped in Xen, for instance:
        * vcpu info pages: moved, but how to update the mapping?
        * EOI page: moved, but how to update the mapping?

Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx>

diff --git a/tools/libxc/Makefile b/tools/libxc/Makefile
--- a/tools/libxc/Makefile
+++ b/tools/libxc/Makefile
@@ -48,6 +48,11 @@ else
 GUEST_SRCS-y += xc_nomigrate.c
 endif
 
+# XXX: Well, for sure there are some x86-isms in the current code.
+#      Making it more ARM friendly should not be a big deal, though;
+#      will do for the next release.
+GUEST_SRCS-$(CONFIG_X86) += xc_domain_movemem.c
+
 vpath %.c ../../xen/common/libelf
 CFLAGS += -I../../xen/common/libelf
 
diff --git a/tools/libxc/xc_domain_movemem.c b/tools/libxc/xc_domain_movemem.c
new file mode 100644
--- /dev/null
+++ b/tools/libxc/xc_domain_movemem.c
@@ -0,0 +1,766 @@
+/******************************************************************************
+ * xc_domain_movemem.c
+ *
+ * Deallocate and reallocate all the memory of a domain.
+ *
+ * Copyright (c) 2013, Dario Faggioli.
+ * Copyright (c) 2012, Citrix Systems, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <xc_core.h>
+
+#include "xc_private.h"
+#include "xc_dom.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+/* Needed by the translation macros in xg_private.h */
+static struct domain_info_context _dinfo;
+static struct domain_info_context *dinfo = &_dinfo;
+
+#define MAX_BATCH_SIZE    1024
+#define MAX_PIN_BATCH     1024
+
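+/*
+ * An MFN belongs to the domain's pseudo-physical map if it is below the
+ * host's maximum MFN and it round-trips: M2P(mfn) yields a PFN within
+ * the P2M, and looking that PFN up in the P2M gives the same MFN back.
+ */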
+#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn, _max_mfn, _minfo, _m2p)            \
+    (((_mfn) < (_max_mfn)) &&                                             \
+     ((mfn_to_pfn(_mfn, _m2p) < (_minfo).p2m_size) &&                     \
+      (pfn_to_mfn(mfn_to_pfn(_mfn, _m2p), (_minfo).p2m_table,             \
+                  (_minfo).guest_width) == (_mfn))))
+
+/*
+ * This is to determine which entries in this page table hold reserved
+ * hypervisor mappings. This depends on the current page table type as
+ * well as the number of paging levels (see also xc_domain_save.c).
+ *
+ * XXX: export this function so that it can be used both here and from
+ *      canonicalize_pagetable(), in xc_domain_save.c.
+ */
+static int is_xen_mapping(struct xc_domain_meminfo *minfo, unsigned long type,
+                          unsigned long hvirt_start, unsigned long m2p_mfn0,
+                          const void *spage, int pte)
+{
+    int xen_start, xen_end, pte_last;
+
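+    /* By default, treat no entries as reserved Xen mappings */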
+    xen_start = xen_end = pte_last = PAGE_SIZE / 8;
+
+    if ( (minfo->pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L3TAB) )
+        xen_start = L3_PAGETABLE_ENTRIES_PAE;
+
+    /*
+     * In PAE only the L2 mapping the top 1GB contains Xen mappings.
+     * We can spot this by looking for the guest's mapping of the m2p.
+     * Guests must ensure that this check will fail for other L2s.
+     */
+    if ( (minfo->pt_levels == 3) && (type == XEN_DOMCTL_PFINFO_L2TAB) )
+    {
+        int hstart;
+        uint64_t he;
+
+        hstart = (hvirt_start >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+        he = ((const uint64_t *) spage)[hstart];
+
+        if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
+        {
+            /* hvirt starts with xen stuff... */
+            xen_start = hstart;
+        }
+        else if ( hvirt_start != 0xf5800000 )
+        {
+            /* old L2s from before hole was shrunk... */
+            hstart = (0xf5800000 >> L2_PAGETABLE_SHIFT_PAE) & 0x1ff;
+            he = ((const uint64_t *) spage)[hstart];
+            if ( ((he >> PAGE_SHIFT) & MFN_MASK_X86) == m2p_mfn0 )
+                xen_start = hstart;
+        }
+    }
+
+    if ( (minfo->pt_levels == 4) && (type == XEN_DOMCTL_PFINFO_L4TAB) )
+    {
+        /*
+         * XXX SMH: should compute these from hvirt_start (which we have)
+         * and hvirt_end (which we don't)
+         */
+        xen_start = 256;
+        xen_end   = 272;
+    }
+
+    return pte >= xen_start && pte < xen_end;
+}
+
+/*
+ * This function will basically deallocate _all_ the memory of a domain and
+ * reallocate it immediately. It relies on the guest being suspended
+ * already, before the function is even invoked.
+ *
+ * Of course, it is quite likely that the memory ends up in different places
+ * from where it was before the call but whether, for instance, the new
+ * location is a different NUMA node (or anything else) does not depend in
+ * any way on this function. In fact, here the guest pages are just freed
+ * and immediately re-allocated (you can see it as a very quick,
+ * back-to-back domain_save--domain_restore). If the current domain
+ * configuration says, for instance, that new allocations should go to a
+ * specific NUMA node, then the whole domain is moved there but, again,
+ * this is not something this function does explicitly.
+ *
+ * If actually interested in doing something like that (i.e., moving the
+ * domain to a different NUMA node), calling xc_domain_node_setaffinity()
+ * right before this should achieve it.
+ */
+int xc_domain_move_memory(xc_interface *xch, uint32_t domid/*, int hvm*/)
+{
+    unsigned int i, j;
+    int rc = 1;
+
+    xc_dominfo_t info;
+    struct xc_domain_meminfo minfo;
+
+    struct mmuext_op pin[MAX_PIN_BATCH];
+    unsigned int nr_pins;
+
+    struct xc_mmu *mmu = NULL;
+    unsigned int xen_pt_levels, dom_guest_width;
+    unsigned long max_mfn, hvirt_start, m2p_mfn0;
+    vcpu_guest_context_any_t ctxt;
+
+    void *live_p2m_frame_list_list = NULL;
+    void *live_p2m_frame_list = NULL;
+
+    /*
+     * XXX: grant tables & granted pages need to be considered, e.g.,
+     *      using xc_is_page_granted_vX() in xc_offline_page.c to
+     *      recognise them, etc.
+    int gnt_num;
+    grant_entry_v1_t *gnttab_v1 = NULL;
+    grant_entry_v2_t *gnttab_v2 = NULL;
+     */
+
+    void *old_p, *new_p, *backup = NULL;
+    unsigned long mfn, pfn;
+    uint64_t fll;
+
+    xen_pfn_t *new_mfns = NULL, *old_mfns = NULL, *batch_pfns = NULL;
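+    /* Page tables are assumed to hold 8-byte entries (PAGE_SIZE/8 each) */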
+    int pte_num = PAGE_SIZE / 8, cleared_pte = 0;
+    xen_pfn_t *m2p_table, *orig_m2p = NULL;
+    shared_info_any_t *live_shinfo = NULL;
+
+    unsigned long n = 0, n_skip = 0;
+
+    int debug = 0; /* XXX will become a parameter */
+
+    if ( !get_platform_info(xch, domid, &max_mfn, &hvirt_start,
+                            &xen_pt_levels, &dom_guest_width) )
+    {
+        ERROR("Failed getting platform info");
+        return 1;
+    }
+
+    /* We expect the domain to be suspended already */
+    if ( xc_domain_getinfo(xch, domid, 1, &info) != 1 )
+    {
+        PERROR("Failed getting domain info");
+        return 1;
+    }
+    if ( !info.shutdown || info.shutdown_reason != SHUTDOWN_suspend )
+    {
+        PERROR("Domain appears not to be suspended");
+        return 1;
+    }
+
+    DBGPRINTF("Establishing the mappings for M2P and P2M");
+    memset(&minfo, 0, sizeof(minfo));
+    if ( !(m2p_table = xc_map_m2p(xch, max_mfn, PROT_READ, &m2p_mfn0)) )
+    {
+        PERROR("Failed to map the M2P table");
+        return 1;
+    }
+    if ( xc_map_domain_meminfo(xch, domid, &minfo) )
+    {
+        PERROR("Failed to map domain's memory information");
+        goto out;
+    }
+    dinfo->guest_width = minfo.guest_width;
+    dinfo->p2m_size = minfo.p2m_size;
+
+    /*
+     * XXX
+    DBGPRINTF("Mapping the grant tables");
+    gnttab_v2 = xc_gnttab_map_table_v2(xch, domid, &gnt_num);
+    if (!gnttab_v2)
+    {
+        PERROR("Failed to map V1 grant table... Trying V1");
+        gnttab_v1 = xc_gnttab_map_table_v1(xch, domid, &gnt_num);
+        if (!gnttab_v1)
+        {
+            PERROR("Failed to map grant table");
+            goto out;
+        }
+    }
+    DBGPRINTF("Grant table mapped. %d grants found", gnt_num);
+     */
+
+    mmu = xc_alloc_mmu_updates(xch, (domid+1)<<16|domid);
+    if ( mmu == NULL )
+    {
+        PERROR("Failed to allocate memory for MMU updates");
+        goto out;
+    }
+
+    /* Alloc support data structures */
+    new_mfns = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t));
+    old_mfns = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t));
+    batch_pfns = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t));
+
+    backup = malloc(PAGE_SIZE * MAX_BATCH_SIZE);
+
+    orig_m2p = calloc(max_mfn, sizeof(xen_pfn_t));
+
+    if ( !new_mfns || !old_mfns || !batch_pfns || !backup || !orig_m2p )
+    {
+        ERROR("Failed to allocate copying and/or backup data structures");
+        goto out;
+    }
+
+    DBGPRINTF("Saving the original M2P");
+    memcpy(orig_m2p, m2p_table, max_mfn * sizeof(xen_pfn_t));
+
+    DBGPRINTF("Starting deallocating and reallocating all memory for domain %d"
+              "\n\tnr_pages=%lu, nr_shared_pages=%lu, nr_paged_pages=%lu"
+              "\n\tnr_online_vcpus=%u, max_vcpu_id=%u",
+              domid, info.nr_pages, info.nr_shared_pages, info.nr_paged_pages,
+              info.nr_online_vcpus, info.max_vcpu_id);
+
+    /* Beware: no going back from this point!! */
+
+    /*
+     * As part of dropping all the references to the existing pages in
+     * memory, so that we can free (and then re-allocate) them, we need
+     * to unpin them.
+     *
+     * We do that in batches of 1024 PFNs at each step, to amortize the cost
+     * of xc_mmuext_op() calls.
+     */
+    nr_pins = 0;
+    for ( i = 0; i < minfo.p2m_size; i++ )
+    {
+        if ( (minfo.pfn_type[i] & XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
+            continue;
+
+        pin[nr_pins].cmd = MMUEXT_UNPIN_TABLE;
+        pin[nr_pins].arg1.mfn = minfo.p2m_table[i];
+        nr_pins++;
+
+        if ( nr_pins == MAX_PIN_BATCH )
+        {
+            if ( xc_mmuext_op(xch, pin, nr_pins, domid) < 0 )
+            {
+                PERROR("Failed to unpin a batch of %d MFNs", nr_pins);
+                goto out;
+            }
+            else
+                DBGPRINTF("Unpinned a batch of %d MFNs", nr_pins);
+            nr_pins = 0;
+        }
+    }
+    if ( (nr_pins != 0) && (xc_mmuext_op(xch, pin, nr_pins, domid) < 0) )
+    {
+        PERROR("Failed to unpin a batch of %d MFNs", nr_pins);
+        goto out;
+    }
+    else
+        DBGPRINTF("Unpinned a batch of %d MFNs", nr_pins);
+
+    /*
+     * After unpinning, we also need to clear the _PAGE_PRESENT bit in
+     * the domain's PTEs for the pages that we want to deallocate, or
+     * they simply could not go away.
+     */
+    for (i = 0; i < minfo.p2m_size; i++)
+    {
+        void *content;
+        xen_pfn_t table_type, table_mfn = pfn_to_mfn(i, minfo.p2m_table,
+                                                     minfo.guest_width);
+
+        if ( table_mfn == INVALID_P2M_ENTRY ||
+             minfo.pfn_type[i] == XEN_DOMCTL_PFINFO_XTAB )
+        {
+            DBGPRINTF("Broken P2M entry at PFN 0x%x", i);
+            continue;
+        }
+
+        table_type = minfo.pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+        if ( table_type < XEN_DOMCTL_PFINFO_L1TAB ||
+             table_type > XEN_DOMCTL_PFINFO_L4TAB )
+            continue;
+
+        content = xc_map_foreign_range(xch, domid, PAGE_SIZE,
+                                       PROT_READ, table_mfn);
+        if ( !content )
+        {
+            PERROR("Failed to map the table at MFN 0x%lx", table_mfn);
+            goto out;
+        }
+
+        /* Go through each PTE of each table and clear the _PAGE_PRESENT bit */
+        for ( j = 0; j < pte_num; j++ )
+        {
+            uint64_t pte = ((uint64_t *)content)[j];
+
+            if ( !pte || is_xen_mapping(&minfo, table_type, hvirt_start,
+                                        m2p_mfn0, content, j) )
+                continue;
+
+            if ( debug )
+                DBGPRINTF("Entry %d: PTE=0x%lx, MFN=0x%lx, PFN=0x%lx", j, pte,
+                          (uint64_t)((pte & MADDR_MASK_X86)>>PAGE_SHIFT),
+                          m2p_table[(unsigned long)((pte & MADDR_MASK_X86)
+                                                    >>PAGE_SHIFT)]);
+
+            pfn = m2p_table[(pte & MADDR_MASK_X86)>>PAGE_SHIFT];
+            pte &= ~_PAGE_PRESENT;
+
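+            /*
+             * The 'ptr' of an MMU update is the machine address of the
+             * PTE to update (table_mfn << PAGE_SHIFT plus the byte offset
+             * of entry j), with the command encoded in its low bits;
+             * MMU_PT_UPDATE_PRESERVE_AD preserves the accessed/dirty bits.
+             */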
+            if ( xc_add_mmu_update(xch, mmu, table_mfn << PAGE_SHIFT |
+                              (j * (sizeof(uint64_t))) |
+                              MMU_PT_UPDATE_PRESERVE_AD, pte) )
+                PERROR("Failed to add some PTE update operation");
+            else
+                cleared_pte++;
+        }
+
+        if (content)
+            munmap(content, PAGE_SIZE);
+    }
+    if ( cleared_pte && xc_flush_mmu_updates(xch, mmu) )
+    {
+        PERROR("Failed flushing some PTE update operations");
+        goto out;
+    }
+    else
+        DBGPRINTF("Cleared presence for %d PTEs", cleared_pte);
+
+    /* Scan all the P2M ... */
+    while ( n < minfo.p2m_size )
+    {
+        /* ... But all operations are done in batches */
+        for ( i = 0; (i < MAX_BATCH_SIZE) && (n < minfo.p2m_size); n++ )
+        {
+            xen_pfn_t mfn = pfn_to_mfn(n, minfo.p2m_table, minfo.guest_width);
+            xen_pfn_t mfn_type = minfo.pfn_type[n] &
+                                 XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+            if ( mfn == INVALID_P2M_ENTRY || !is_mapped(mfn) )
+            {
+                if ( debug )
+                    DBGPRINTF("Skipping invalid or unmapped MFN 0x%lx", mfn);
+                n_skip++;
+                continue;
+            }
+            if ( mfn_type == XEN_DOMCTL_PFINFO_BROKEN ||
+                 mfn_type == XEN_DOMCTL_PFINFO_XTAB ||
+                 mfn_type == XEN_DOMCTL_PFINFO_XALLOC )
+            {
+                if ( debug )
+                    DBGPRINTF("Skippong broken or alloc only MFN 0x%lx", mfn);
+                n_skip++;
+                continue;
+            }
+
+            /*
+            if ( gnttab_v1 ?
+                 xc_is_page_granted_v1(xch, mfn, gnttab_v1, gnt_num) :
+                 xc_is_page_granted_v2(xch, mfn, gnttab_v2, gnt_num) )
+            {
+                n_skip++;
+                continue;
+            }
+             */
+
+            old_mfns[i] = mfn;
+            batch_pfns[i] = n;
+            i++;
+        }
+
+        /* Was the batch empty? */
+        if ( i == 0 )
+            continue;
+
+        /*
+         * And now the core of the whole thing: map the PFNs in the batch,
+         * back them up, allocate new pages for them, and copy the content
+         * over. We do it in this order, going through a local backup,
+         * because we don't want to risk hitting the max_mem limit for
+         * the domain (which would be possible, depending on MAX_BATCH_SIZE,
+         * if we tried to do it as allocate->copy->deallocate).
+         *
+         * With MAX_BATCH_SIZE of 1024 and 4K pages, this means we are moving
+         * 4MB of guest memory for each batch.
+         */
+
+        /* Map and backup */
+        old_p = xc_map_foreign_pages(xch, domid, PROT_READ, old_mfns, i);
+        if ( !old_p )
+        {
+            PERROR("Failed mapping the current MFNs\n");
+            goto out;
+        }
+        memcpy(backup, old_p, PAGE_SIZE * i);
+        munmap(old_p, PAGE_SIZE * i);
+
+        /* Deallocation and re-allocation */
+        if ( xc_domain_decrease_reservation(xch, domid, i, 0, old_mfns) != i ||
+             xc_domain_populate_physmap_exact(xch, domid, i, 0, 0, new_mfns) )
+        {
+            PERROR("Failed making space or allocating the new MFNs\n");
+            goto out;
+        }
+
+        /* Map of new pages, copy content and unmap */
+        new_p = xc_map_foreign_pages(xch, domid, PROT_WRITE, new_mfns, i);
+        if ( !new_p )
+        {
+            PERROR("Failed mapping the new MFNs\n");
+            goto out;
+        }
+        memcpy(new_p, backup, PAGE_SIZE * i);
+        munmap(new_p, PAGE_SIZE * i);
+        /* NB: backup is malloc()ed and freed at 'out:', never munmap()ed */
+
+        /*
+         * Since we already have the new MFNs, we can update both the M2P
+         * and the P2M right here, within this same loop.
+         */
+        for ( j = 0; j < i; j++ )
+        {
+            minfo.p2m_table[batch_pfns[j]] = new_mfns[j];
+            if ( xc_add_mmu_update(xch, mmu,
+                                   (((uint64_t)new_mfns[j]) << PAGE_SHIFT) |
+                                   MMU_MACHPHYS_UPDATE, batch_pfns[j]) )
+            {
+                PERROR("Failed updating M2P\n");
+                goto out;
+            }
+        }
+        if ( xc_flush_mmu_updates(xch, mmu) )
+        {
+            PERROR("Failed updating M2P\n");
+            goto out;
+        }
+
+        DBGPRINTF("Batch %lu/%ld done (%lu pages skipped)",
+                  n / MAX_BATCH_SIZE, minfo.p2m_size / MAX_BATCH_SIZE, n_skip);
+    }
+
+    /*
+     * Finally (oh, well...) update the PTEs of the domain again, putting
+     * the new MFNs there, and making the entries _PAGE_PRESENT again.
+     *
+     * This is a kind of uncanonicalization, like what happens in
+     * save-restore, although a very special one, and we rely on the
+     * snapshot of the M2P we made before starting the whole
+     * deallocation/reallocation process.
+     */
+    for ( i = 0; i < minfo.p2m_size; i++ )
+    {
+        void *content;
+        xen_pfn_t table_type, table_mfn = pfn_to_mfn(i, minfo.p2m_table,
+                                                     minfo.guest_width);
+
+        table_type = minfo.pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+        if ( table_type < XEN_DOMCTL_PFINFO_L1TAB ||
+             table_type > XEN_DOMCTL_PFINFO_L4TAB )
+            continue;
+
+        /* We of course only care about tables */
+        content = xc_map_foreign_range(xch, domid, PAGE_SIZE,
+                                       PROT_WRITE, table_mfn);
+        if ( !content )
+        {
+            PERROR("Failed to map the table at MFN 0x%lx", table_mfn);
+            continue;
+        }
+
+        for ( j = 0; j < PAGE_SIZE / 8; j++ )
+        {
+            uint64_t pte = ((uint64_t *)content)[j];
+
+            if ( !pte || is_xen_mapping(&minfo, table_type, hvirt_start,
+                                        m2p_mfn0, content, j) )
+                continue;
+
+            /*
+             * Basically, we look the PFN up in the snapshotted M2P and we
+             * pick up the new MFN from the P2M (since we updated it "live"
+             * during the re-allocation phase above).
+             */
+            mfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86;
+            pfn = orig_m2p[mfn];
+
+            if ( debug )
+                DBGPRINTF("Table[PTE]: 0x%lx[%d] ==> orig_m2p[0x%lx]=0x%lx, "
+                          "p2m[0x%lx]=0x%lx // pte: 0x%lx --> 0x%lx",
+                          table_mfn, j, mfn, pfn, pfn, minfo.p2m_table[pfn],
+                          pte,  (uint64_t)((pte & ~MADDR_MASK_X86)|
+                                           (minfo.p2m_table[pfn]<<PAGE_SHIFT)|
+                                            _PAGE_PRESENT));
+
+            mfn = minfo.p2m_table[pfn];
+            pte &= ~MADDR_MASK_X86;
+            pte |= (uint64_t)mfn << PAGE_SHIFT;
+            pte |= _PAGE_PRESENT;
+
+            ((uint64_t *)content)[j] = pte;
+
+            if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn, max_mfn, minfo, m2p_table) )
+            {
+                ERROR("Failed updating entry %d in table at MFN 0x%lx",
+                      j, table_mfn);
+                continue; // XXX
+            }
+        }
+
+        if ( content )
+            munmap(content, PAGE_SIZE);
+    }
+
+    DBGPRINTF("Re-pinning page table MFNs");
+
+    /* Pin the page table MFNs again */
+    nr_pins = 0;
+    for ( i = 0; i < minfo.p2m_size; i++ )
+    {
+        if ( (minfo.pfn_type[i] & XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
+            continue;
+
+        switch ( minfo.pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK )
+        {
+        case XEN_DOMCTL_PFINFO_L1TAB:
+            pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
+            break;
+
+        case XEN_DOMCTL_PFINFO_L2TAB:
+            pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
+            break;
+
+        case XEN_DOMCTL_PFINFO_L3TAB:
+            pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
+            break;
+
+        case XEN_DOMCTL_PFINFO_L4TAB:
+            pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
+            break;
+        default:
+            continue;
+        }
+        pin[nr_pins].arg1.mfn = minfo.p2m_table[i];
+        nr_pins++;
+
+        if ( nr_pins == MAX_PIN_BATCH )
+        {
+            if ( xc_mmuext_op(xch, pin, nr_pins, domid) < 0 )
+            {
+                PERROR("Failed to pin a batch of %d MFNs", nr_pins);
+                goto out;
+            }
+            else
+                DBGPRINTF("Re-pinned a batch of %d MFNs", nr_pins);
+            nr_pins = 0;
+        }
+    }
+    if ( (nr_pins != 0) && (xc_mmuext_op(xch, pin, nr_pins, domid) < 0) )
+    {
+        PERROR("Failed to pin batch of %d page tables", nr_pins);
+        goto out;
+    }
+    else
+        DBGPRINTF("Re-pinned a batch of %d MFNs", nr_pins);
+
+    /*
+     * Now, take care of the vCPU contexts. It all happens as above:
+     * we use the original M2P and the new domain's P2M to update all
+     * the various references.
+     */
+    for ( i = 0; i <= info.max_vcpu_id; i++ )
+    {
+        xc_vcpuinfo_t vinfo;
+
+        DBGPRINTF("Adjusting context for VCPU%d", i);
+
+        if ( xc_vcpu_getinfo(xch, domid, i, &vinfo) )
+        {
+            PERROR("Failed getting info for VCPU%d", i);
+            goto out;
+        }
+        if ( !vinfo.online )
+        {
+            DBGPRINTF("VCPU%d seems offline", i);
+            continue;
+        }
+
+        if ( xc_vcpu_getcontext(xch, domid, i, &ctxt) )
+        {
+            PERROR("No context for VCPU%d", i);
+            goto out;
+        }
+
+        if ( i == 0 )
+        {
+            //start_info_any_t *start_info;
+
+            /*
+             * Update the start info frame number. It is the 3rd argument
+             * to the HYPERVISOR_sched_op hypercall when op is
+             * SCHEDOP_shutdown and reason is SHUTDOWN_suspend, so we find
+             * it in EDX.
+             */
+            mfn = GET_FIELD(&ctxt, user_regs.edx);
+            mfn = minfo.p2m_table[mfn_to_pfn(mfn, orig_m2p)];
+            SET_FIELD(&ctxt, user_regs.edx, mfn);
+
+            /*
+             * XXX: I checked, and store_mfn and console_mfn seemed ok, at
+             *      least from a 'mapping' point of view, but more testing is
+             *      needed.
+            start_info = xc_map_foreign_range(xch, domid, PAGE_SIZE,
+                                              PROT_READ | PROT_WRITE, mfn);
+            munmap(start_info, PAGE_SIZE);
+             */
+        }
+
+        /* GDT pointing MFNs */
+        for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents); j++ )
+        {
+            mfn = GET_FIELD(&ctxt, gdt_frames[j]);
+            mfn = minfo.p2m_table[mfn_to_pfn(mfn, orig_m2p)];
+            SET_FIELD(&ctxt, gdt_frames[j], mfn);
+        }
+
+        /* CR3. XXX: PAE needs special attention here, I think */
+        mfn = UNFOLD_CR3(GET_FIELD(&ctxt, ctrlreg[3]));
+        mfn = minfo.p2m_table[mfn_to_pfn(mfn, orig_m2p)];
+        SET_FIELD(&ctxt, ctrlreg[3], FOLD_CR3(mfn));
+
+        /* Guest pagetable (x86/64) stored in otherwise-unused CR1 */
+        if ( (minfo.pt_levels == 4) && ctxt.x64.ctrlreg[1] )
+        {
+            /*
+             * XXX: the save-restore code fiddles with the least-significant
+             *      bit ('valid PFN'). This should not be needed here.
+             */
+            mfn = UNFOLD_CR3(ctxt.x64.ctrlreg[1]);
+            mfn = minfo.p2m_table[mfn_to_pfn(mfn, orig_m2p)];
+            ctxt.x64.ctrlreg[1] = FOLD_CR3(mfn);
+        }
+
+        /*
+         * XXX: Xen refuses to set a new context for an existing vCPU if
+         *      things like CR3 or the GDT have changed, even if the domain
+         *      is suspended. Re-initializing the vCPU first (the call below
+         *      with a NULL ctxt) makes it possible, but is that sensible?
+         *      And even if it is, is the _setcontext call issued right
+         *      after it enough?
+         */
+        if ( xc_vcpu_setcontext(xch, domid, i, NULL) )
+        {
+            PERROR("Failed re-initialising VCPU%d", i);
+            goto out;
+        }
+        if ( xc_vcpu_setcontext(xch, domid, i, &ctxt) )
+        {
+            PERROR("Failed when updating context for VCPU%d", i);
+            goto out;
+        }
+    }
+
+    /*
+     * Finally (and this time for real), we take care of the pages mapping
+     * the P2M, and of the P2M entries themselves.
+     */
+
+    live_shinfo = xc_map_foreign_range(xch, domid,
+                     PAGE_SIZE, PROT_READ|PROT_WRITE, info.shared_info_frame);
+    if ( !live_shinfo )
+    {
+        PERROR("Failed mapping live_shinfo");
+        goto out;
+    }
+
+    fll = GET_FIELD(live_shinfo, arch.pfn_to_mfn_frame_list_list);
+    fll = minfo.p2m_table[mfn_to_pfn(fll, orig_m2p)];
+    live_p2m_frame_list_list = xc_map_foreign_range(xch, domid, PAGE_SIZE,
+                                                    PROT_READ|PROT_WRITE, fll);
+    if ( !live_p2m_frame_list_list )
+    {
+        PERROR("Couldn't map live_p2m_frame_list_list");
+        goto out;
+    }
+    SET_FIELD(live_shinfo, arch.pfn_to_mfn_frame_list_list, fll);
+
+    /* First, update the frames containing the list of the P2M frames */
+    for ( i = 0; i < P2M_FLL_ENTRIES; i++ )
+    {
+        mfn = ((uint64_t *)live_p2m_frame_list_list)[i];
+        mfn = minfo.p2m_table[mfn_to_pfn(mfn, orig_m2p)];
+        ((uint64_t *)live_p2m_frame_list_list)[i] = mfn;
+    }
+
+    live_p2m_frame_list =
+        xc_map_foreign_pages(xch, domid, PROT_READ|PROT_WRITE,
+                             live_p2m_frame_list_list,
+                             P2M_FLL_ENTRIES);
+    if ( !live_p2m_frame_list )
+    {
+        PERROR("Couldn't map live_p2m_frame_list");
+        goto out;
+    }
+
+    /* And then update the actual entries of it */
+    for ( i = 0; i < P2M_FL_ENTRIES; i++ )
+    {
+        mfn = ((uint64_t *)live_p2m_frame_list)[i];
+        mfn = minfo.p2m_table[mfn_to_pfn(mfn, orig_m2p)];
+        ((uint64_t *)live_p2m_frame_list)[i] = mfn;
+    }
+
+    rc = 0;
+
+ out:
+    if ( live_p2m_frame_list_list )
+        munmap(live_p2m_frame_list_list, PAGE_SIZE);
+    if ( live_p2m_frame_list )
+        munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE);
+    if ( live_shinfo )
+        munmap(live_shinfo, PAGE_SIZE);
+
+    free(mmu);
+    free(new_mfns);
+    free(old_mfns);
+    free(batch_pfns);
+    free(backup);
+    free(orig_m2p);
+
+    /*
+    if (gnttab_v1)
+        munmap(gnttab_v1, gnt_num / (PAGE_SIZE/sizeof(grant_entry_v1_t)));
+    if (gnttab_v2)
+        munmap(gnttab_v2, gnt_num / (PAGE_SIZE/sizeof(grant_entry_v2_t)));
+     */
+
+    xc_unmap_domain_meminfo(xch, &minfo);
+    munmap(m2p_table, M2P_SIZE(max_mfn));
+
+    return !!rc;
+}
diff --git a/tools/libxc/xenguest.h b/tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h
+++ b/tools/libxc/xenguest.h
@@ -272,6 +272,15 @@ int xc_query_page_offline_status(xc_inte
 
 int xc_exchange_page(xc_interface *xch, int domid, xen_pfn_t mfn);
 
+/**
+ * This function deallocates all the guest's memory and immediately
+ * allocates it again, with the net effect of moving it somewhere else
+ * with respect to where it was when the function was invoked.
+ *
+ * @param xch a handle to an open hypervisor interface.
+ * @param domid the domain id one wants to move the memory of.
+ */
+int xc_domain_move_memory(xc_interface *xch, uint32_t domid/*, int hvm*/);
 
 /**
  * Memory related information, such as PFN types, the P2M table,
diff --git a/tools/libxc/xg_private.h b/tools/libxc/xg_private.h
--- a/tools/libxc/xg_private.h
+++ b/tools/libxc/xg_private.h
@@ -145,6 +145,11 @@ static inline xen_pfn_t pfn_to_mfn(xen_p
                             (((uint32_t *)p2m)[(pfn)]))));
 }
 
+static inline xen_pfn_t mfn_to_pfn(xen_pfn_t mfn, xen_pfn_t *m2p)
+{
+    return m2p[mfn];
+}
+
 /* Number of xen_pfn_t in a page */
 #define FPP             (PAGE_SIZE/(dinfo->guest_width))
 
