[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [Patch v6 13/13] tools/libxc: noarch restore code



Restore a domain from the new format.  This reads and validates the domain and
image header and loads the guest memory from the PAGE_DATA records, populating
the p2m as it does so.

This provides the xc_domain_restore2() function as an alternative to the
existing xc_domain_restore().

Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>

---
v6:
 * Fix error path with rc = 0.
 * Fix undefined memory issue with creating the populated_pfns array.
---
 tools/libxc/saverestore/common.h  |    6 +
 tools/libxc/saverestore/restore.c |  628 ++++++++++++++++++++++++++++++++++++-
 2 files changed, 633 insertions(+), 1 deletion(-)

diff --git a/tools/libxc/saverestore/common.h b/tools/libxc/saverestore/common.h
index 4894cac..4840d3f 100644
--- a/tools/libxc/saverestore/common.h
+++ b/tools/libxc/saverestore/common.h
@@ -289,6 +289,12 @@ int arch_write_saving_cpu(struct xc_sr_context *ctx);
  */
 int arch_handle_saving_cpu(struct xc_sr_context *ctx, struct xc_sr_record 
*rec);
 
+/* TODO - find a better way of hiding this.  It should be private to
+ * restore.c, but is needed by x86_pv_localise_page()
+ */
+int populate_pfns(struct xc_sr_context *ctx, unsigned count,
+                  const xen_pfn_t *original_pfns, const uint32_t *types);
+
 #endif
 /*
  * Local variables:
diff --git a/tools/libxc/saverestore/restore.c 
b/tools/libxc/saverestore/restore.c
index 6624baa..4123d3a 100644
--- a/tools/libxc/saverestore/restore.c
+++ b/tools/libxc/saverestore/restore.c
@@ -1,5 +1,570 @@
+#include <arpa/inet.h>
+
 #include "common.h"
 
+/*
+ * Read and validate the Image and Domain headers.
+ */
+static int read_headers(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_ihdr ihdr;
+    struct xc_sr_dhdr dhdr;
+
+    if ( read_exact(ctx->fd, &ihdr, sizeof(ihdr)) )
+    {
+        PERROR("Failed to read Image Header from stream");
+        return -1;
+    }
+
+    ihdr.id      = ntohl(ihdr.id);
+    ihdr.version = ntohl(ihdr.version);
+    ihdr.options = ntohs(ihdr.options);
+
+    if ( ihdr.marker != IHDR_MARKER )
+    {
+        ERROR("Invalid marker: Got 0x%016"PRIx64, ihdr.marker);
+        return -1;
+    }
+    else if ( ihdr.id != IHDR_ID )
+    {
+        ERROR("Invalid ID: Expected 0x%08"PRIx32", Got 0x%08"PRIx32,
+              IHDR_ID, ihdr.id);
+        return -1;
+    }
+    else if ( ihdr.version != IHDR_VERSION )
+    {
+        ERROR("Invalid Version: Expected %d, Got %d", ihdr.version, 
IHDR_VERSION);
+        return -1;
+    }
+    else if ( ihdr.options & IHDR_OPT_BIG_ENDIAN )
+    {
+        ERROR("Unable to handle big endian streams");
+        return -1;
+    }
+
+    ctx->restore.format_version = ihdr.version;
+
+    if ( read_exact(ctx->fd, &dhdr, sizeof(dhdr)) )
+    {
+        PERROR("Failed to read Domain Header from stream");
+        return -1;
+    }
+
+    ctx->restore.guest_type = dhdr.type;
+    ctx->restore.guest_page_size = (1U << dhdr.page_shift);
+
+    if ( dhdr.xen_major == 0 )
+    {
+        IPRINTF("Found %s domain, converted from legacy stream format",
+                dhdr_type_to_str(dhdr.type));
+        DPRINTF("  Legacy conversion script version %u", dhdr.xen_minor);
+    }
+    else
+        IPRINTF("Found %s domain from Xen %u.%u",
+                dhdr_type_to_str(dhdr.type), dhdr.xen_major, dhdr.xen_minor);
+    return 0;
+}
+
+/**
+ * Reads a record from the stream, and fills in the record structure.
+ *
+ * Returns 0 on success and non-0 on failure.
+ *
+ * On success, the records type and size shall be valid.
+ * - If size is 0, data shall be NULL.
+ * - If size is non-0, data shall be a buffer allocated by malloc() which must
+ *   be passed to free() by the caller.
+ *
+ * On failure, the contents of the record structure are undefined.
+ */
+static int read_record(struct xc_sr_context *ctx, struct xc_sr_record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_rhdr rhdr;
+    size_t datasz;
+
+    if ( read_exact(ctx->fd, &rhdr, sizeof(rhdr)) )
+    {
+        PERROR("Failed to read Record Header from stream");
+        return -1;
+    }
+    else if ( rhdr.length > REC_LENGTH_MAX )
+    {
+        ERROR("Record (0x%08"PRIx32", %s) length 0x%"PRIx32
+              " exceeds max (0x%"PRIx32")",
+              rhdr.type, rec_type_to_str(rhdr.type),
+              rhdr.length, REC_LENGTH_MAX);
+        return -1;
+    }
+
+    datasz = ROUNDUP(rhdr.length, REC_ALIGN_ORDER);
+
+    if ( datasz )
+    {
+        rec->data = malloc(datasz);
+
+        if ( !rec->data )
+        {
+            ERROR("Unable to allocate %zu bytes for record data 
(0x%08"PRIx32", %s)",
+                  datasz, rhdr.type, rec_type_to_str(rhdr.type));
+            return -1;
+        }
+
+        if ( read_exact(ctx->fd, rec->data, datasz) )
+        {
+            free(rec->data);
+            rec->data = NULL;
+            PERROR("Failed to read %zu bytes of data for record 
(0x%08"PRIx32", %s)",
+                   datasz, rhdr.type, rec_type_to_str(rhdr.type));
+            return -1;
+        }
+    }
+    else
+        rec->data = NULL;
+
+    rec->type   = rhdr.type;
+    rec->length = rhdr.length;
+
+    return 0;
+};
+
+/*
+ * Is a pfn populated?
+ */
+static bool pfn_is_populated(const struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    if ( pfn > ctx->restore.max_populated_pfn )
+        return false;
+    return test_bit(pfn, ctx->restore.populated_pfns);
+}
+
+/*
+ * Set a pfn as populated, expanding the tracking structures if needed. To
+ * avoid realloc()ing too excessivly, the size increased to the nearest power
+ * of two large enough to contain the required pfn.
+ */
+static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    xc_interface *xch = ctx->xch;
+
+    if ( pfn > ctx->restore.max_populated_pfn )
+    {
+        xen_pfn_t new_max;
+        size_t old_sz, new_sz;
+        unsigned long *p;
+
+        /* Round up to the nearest power of two larger than pfn, less 1. */
+        new_max = pfn;
+        new_max |= new_max >> 1;
+        new_max |= new_max >> 2;
+        new_max |= new_max >> 4;
+        new_max |= new_max >> 8;
+        new_max |= new_max >> 16;
+        if ( sizeof(xen_pfn_t) >= 8 )
+             new_max |= new_max >> 32;
+
+        old_sz = bitmap_size(ctx->restore.max_populated_pfn + 1);
+        new_sz = bitmap_size(new_max + 1);
+        p = realloc(ctx->restore.populated_pfns, new_sz);
+        if ( !p )
+        {
+            ERROR("Failed to realloc populated bitmap");
+            errno = ENOMEM;
+            return -1;
+        }
+
+        memset((uint8_t *)p + old_sz, 0x00, new_sz - old_sz);
+
+        ctx->restore.populated_pfns    = p;
+        ctx->restore.max_populated_pfn = new_max;
+    }
+
+    set_bit(pfn, ctx->restore.populated_pfns);
+
+    return 0;
+}
+
+/*
+ * Given a set of pfns, obtain memory from Xen to fill the physmap for the
+ * unpopulated subset.
+ */
+int populate_pfns(struct xc_sr_context *ctx, unsigned count,
+                  const xen_pfn_t *original_pfns, const uint32_t *types)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t *mfns = malloc(count * sizeof(*mfns)),
+        *pfns = malloc(count * sizeof(*pfns));
+    unsigned i, nr_pfns = 0;
+    int rc = -1;
+
+    if ( !mfns || !pfns )
+    {
+        ERROR("Failed to allocate %zu bytes for populating the physmap",
+              2 * count * sizeof(*mfns));
+        goto err;
+    }
+
+    for ( i = 0; i < count; ++i )
+    {
+        if ( types[i] != XEN_DOMCTL_PFINFO_XTAB &&
+             types[i] != XEN_DOMCTL_PFINFO_BROKEN &&
+             !pfn_is_populated(ctx, original_pfns[i]) )
+        {
+            pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i];
+            ++nr_pfns;
+        }
+    }
+
+    if ( nr_pfns )
+    {
+        rc = xc_domain_populate_physmap_exact(xch, ctx->domid, nr_pfns, 0, 0, 
mfns);
+        if ( rc )
+        {
+            PERROR("Failed to populate physmap");
+            goto err;
+        }
+
+        for ( i = 0; i < nr_pfns; ++i )
+        {
+            if ( mfns[i] == INVALID_MFN )
+            {
+                ERROR("Populate physmap failed for pfn %u", i);
+                rc = -1;
+                goto err;
+            }
+
+            rc = pfn_set_populated(ctx, pfns[i]);
+            if ( rc )
+                goto err;
+            ctx->restore.ops.set_gfn(ctx, pfns[i], mfns[i]);
+        }
+    }
+
+    rc = 0;
+
+ err:
+    free(pfns);
+    free(mfns);
+
+    return rc;
+}
+
+/*
+ * Given a list of pfns, their types, and a block of page data from the
+ * stream, populate and record their types, map the relevent subset and copy
+ * the data into the guest.
+ */
+static int process_page_data(struct xc_sr_context *ctx, unsigned count,
+                             xen_pfn_t *pfns, uint32_t *types, void *page_data)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t *mfns = malloc(count * sizeof(*mfns));
+    int *map_errs = malloc(count * sizeof(*map_errs));
+    int rc;
+    void *mapping = NULL, *guest_page = NULL;
+    unsigned i,    /* i indexes the pfns from the record. */
+        j,         /* j indexes the subset of pfns we decide to map. */
+        nr_pages;
+
+    if ( !mfns || !map_errs )
+    {
+        rc = -1;
+        ERROR("Failed to allocate %zu bytes to process page data",
+              count * (sizeof(*mfns) + sizeof(*map_errs)));
+        goto err;
+    }
+
+    rc = populate_pfns(ctx, count, pfns, types);
+    if ( rc )
+    {
+        ERROR("Failed to populate pfns for batch of %u pages", count);
+        goto err;
+    }
+
+    for ( i = 0, nr_pages = 0; i < count; ++i )
+    {
+        ctx->restore.ops.set_page_type(ctx, pfns[i], types[i]);
+
+        switch ( types[i] )
+        {
+        case XEN_DOMCTL_PFINFO_NOTAB:
+
+        case XEN_DOMCTL_PFINFO_L1TAB:
+        case XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB:
+
+        case XEN_DOMCTL_PFINFO_L2TAB:
+        case XEN_DOMCTL_PFINFO_L2TAB | XEN_DOMCTL_PFINFO_LPINTAB:
+
+        case XEN_DOMCTL_PFINFO_L3TAB:
+        case XEN_DOMCTL_PFINFO_L3TAB | XEN_DOMCTL_PFINFO_LPINTAB:
+
+        case XEN_DOMCTL_PFINFO_L4TAB:
+        case XEN_DOMCTL_PFINFO_L4TAB | XEN_DOMCTL_PFINFO_LPINTAB:
+
+            mfns[nr_pages++] = ctx->restore.ops.pfn_to_gfn(ctx, pfns[i]);
+            break;
+        }
+
+    }
+
+    if ( nr_pages > 0 )
+    {
+        mapping = guest_page = xc_map_foreign_bulk(
+            xch, ctx->domid, PROT_READ | PROT_WRITE,
+            mfns, map_errs, nr_pages);
+        if ( !mapping )
+        {
+            rc = -1;
+            PERROR("Unable to map %u mfns for %u pages of data",
+                   nr_pages, count);
+            goto err;
+        }
+    }
+
+    for ( i = 0, j = 0; i < count; ++i )
+    {
+        switch ( types[i] )
+        {
+        case XEN_DOMCTL_PFINFO_XTAB:
+        case XEN_DOMCTL_PFINFO_BROKEN:
+        case XEN_DOMCTL_PFINFO_XALLOC:
+            /* No page data to deal with. */
+            continue;
+        }
+
+        if ( map_errs[j] )
+        {
+            rc = -1;
+            ERROR("Mapping pfn %lx (mfn %lx, type %#"PRIx32")failed with %d",
+                  pfns[i], mfns[j], types[i], map_errs[j]);
+            goto err;
+        }
+
+        /* Undo page normalisation done by the saver. */
+        rc = ctx->restore.ops.localise_page(ctx, types[i], page_data);
+        if ( rc )
+        {
+            DPRINTF("Failed to localise");
+            goto err;
+        }
+
+        if ( ctx->restore.verify )
+        {
+            /* Verify mode - compare incoming data to what we already have. */
+            if ( memcmp(guest_page, page_data, PAGE_SIZE) )
+                ERROR("verify pfn %lx failed (type %#x)",
+                      pfns[i], types[i] >> XEN_DOMCTL_PFINFO_LTAB_SHIFT);
+        }
+        else
+        {
+            /* Regular mode - copy incoming data into place. */
+            memcpy(guest_page, page_data, PAGE_SIZE);
+        }
+
+        ++j;
+        guest_page += PAGE_SIZE;
+        page_data += PAGE_SIZE;
+    }
+
+    rc = 0;
+
+ err:
+    if ( mapping )
+        munmap(mapping, nr_pages * PAGE_SIZE);
+
+    free(map_errs);
+    free(mfns);
+
+    return rc;
+}
+
+/*
+ * Validate a PAGE_DATA record from the stream, and pass the results to
+ * process_page_data() to actually perform the legwork.
+ */
+static int handle_page_data(struct xc_sr_context *ctx, struct xc_sr_record 
*rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_rec_page_data_header *pages = rec->data;
+    unsigned i, pages_of_data = 0;
+    int rc = -1;
+
+    xen_pfn_t *pfns = NULL, pfn;
+    uint32_t *types = NULL, type;
+
+    if ( rec->length < sizeof(*pages) )
+    {
+        ERROR("PAGE_DATA record truncated: length %"PRIu32", min %zu",
+              rec->length, sizeof(*pages));
+        goto err;
+    }
+    else if ( pages->count < 1 )
+    {
+        ERROR("Expected at least 1 pfn in PAGE_DATA record");
+        goto err;
+    }
+    else if ( rec->length < sizeof(*pages) + (pages->count * sizeof(uint64_t)) 
)
+    {
+        ERROR("PAGE_DATA record (length %"PRIu32") too short to contain %"
+              PRIu32" pfns worth of information", rec->length, pages->count);
+        goto err;
+    }
+
+    pfns = malloc(pages->count * sizeof(*pfns));
+    types = malloc(pages->count * sizeof(*types));
+    if ( !pfns || !types )
+    {
+        ERROR("Unable to allocate enough memory for %"PRIu32" pfns",
+              pages->count);
+        goto err;
+    }
+
+    for ( i = 0; i < pages->count; ++i )
+    {
+        pfn = pages->pfn[i] & PAGE_DATA_PFN_MASK;
+        if ( !ctx->restore.ops.pfn_is_valid(ctx, pfn) )
+        {
+            ERROR("pfn %#lx (index %u) outside domain maximum", pfn, i);
+            goto err;
+        }
+
+        type = (pages->pfn[i] & PAGE_DATA_TYPE_MASK) >> 32;
+        if ( ((type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT) >= 5) &&
+             ((type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT) <= 8) )
+        {
+            ERROR("Invalid type %#"PRIx32" for pfn %#lx (index %u)", type, 
pfn, i);
+            goto err;
+        }
+        else if ( type < XEN_DOMCTL_PFINFO_BROKEN )
+            /* NOTAB and all L1 thru L4 tables (including pinned) should have
+             * a page worth of data in the record. */
+            pages_of_data++;
+
+        pfns[i] = pfn;
+        types[i] = type;
+    }
+
+    if ( rec->length != (sizeof(*pages) +
+                         (sizeof(uint64_t) * pages->count) +
+                         (PAGE_SIZE * pages_of_data)) )
+    {
+        ERROR("PAGE_DATA record wrong size: length %"PRIu32", expected "
+              "%zu + %zu + %zu", rec->length, sizeof(*pages),
+              (sizeof(uint64_t) * pages->count), (PAGE_SIZE * pages_of_data));
+        goto err;
+    }
+
+    rc = process_page_data(ctx, pages->count, pfns, types,
+                           &pages->pfn[pages->count]);
+ err:
+    free(types);
+    free(pfns);
+
+    return rc;
+}
+
+/*
+ * Restore a domain.
+ */
+static int restore(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    struct xc_sr_record rec;
+    int rc, saved_rc = 0, saved_errno = 0;
+
+    IPRINTF("Restoring domain");
+
+    rc = ctx->restore.ops.setup(ctx);
+    if ( rc )
+        goto err;
+
+    ctx->restore.max_populated_pfn = (32 * 1024 / 4) - 1;
+    ctx->restore.populated_pfns = bitmap_alloc(
+        ctx->restore.max_populated_pfn + 1);
+    if ( !ctx->restore.populated_pfns )
+    {
+        ERROR("Unable to allocate memory for populated_pfns bitmap");
+        goto err;
+    }
+
+    do
+    {
+        rc = read_record(ctx, &rec);
+        if ( rc )
+            goto err;
+
+        switch ( rec.type )
+        {
+        case REC_TYPE_END:
+            DPRINTF("End record");
+            break;
+
+        case REC_TYPE_PAGE_DATA:
+            rc = handle_page_data(ctx, &rec);
+            break;
+
+        case REC_TYPE_VERIFY:
+            DPRINTF("Verify mode enabled");
+            ctx->restore.verify = true;
+            break;
+
+        case REC_TYPE_SAVING_CPU:
+            rc = arch_handle_saving_cpu(ctx, &rec);
+            break;
+
+        default:
+            rc = ctx->restore.ops.process_record(ctx, &rec);
+            break;
+        }
+
+        free(rec.data);
+
+        if ( rc == RECORD_NOT_PROCESSED )
+        {
+            if ( rec.type & REC_TYPE_OPTIONAL )
+                DPRINTF("Ignoring optional record %#x (%s)",
+                        rec.type, rec_type_to_str(rec.type));
+            else
+            {
+                ERROR("Manditory record %#x (%s) not handled",
+                      rec.type, rec_type_to_str(rec.type));
+                rc = -1;
+            }
+        }
+
+        if ( rc )
+            goto err;
+
+    } while ( rec.type != REC_TYPE_END );
+
+    rc = ctx->restore.ops.stream_complete(ctx);
+    if ( rc )
+        goto err;
+
+    IPRINTF("Restore successful");
+    goto done;
+
+ err:
+    saved_errno = errno;
+    saved_rc = rc;
+    PERROR("Restore failed");
+
+ done:
+    free(ctx->restore.populated_pfns);
+    rc = ctx->restore.ops.cleanup(ctx);
+    if ( rc )
+        PERROR("Failed to clean up");
+
+    if ( saved_rc )
+    {
+        rc = saved_rc;
+        errno = saved_errno;
+    }
+
+    return rc;
+}
+
 int xc_domain_restore2(xc_interface *xch, int io_fd, uint32_t dom,
                        unsigned int store_evtchn, unsigned long *store_mfn,
                        domid_t store_domid, unsigned int console_evtchn,
@@ -8,8 +573,69 @@ int xc_domain_restore2(xc_interface *xch, int io_fd, 
uint32_t dom,
                        int checkpointed_stream,
                        struct restore_callbacks *callbacks)
 {
+    struct xc_sr_context ctx =
+        {
+            .xch = xch,
+            .fd = io_fd,
+        };
+
+    /* GCC 4.4 (of CentOS 6.x vintage) can' t initialise anonymous unions :( */
+    ctx.restore.console_evtchn = console_evtchn;
+    ctx.restore.console_domid = console_domid;
+    ctx.restore.xenstore_evtchn = store_evtchn;
+    ctx.restore.xenstore_domid = store_domid;
+    ctx.restore.callbacks = callbacks;
+
     IPRINTF("In experimental %s", __func__);
-    return -1;
+    DPRINTF("fd %d, dom %"PRIu32", hvm %u, pae %u, superpages %d"
+            ", checkpointed_stream %d", io_fd, dom, hvm, pae,
+            superpages, checkpointed_stream);
+
+    if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 )
+    {
+        PERROR("Failed to get domain info");
+        return -1;
+    }
+
+    if ( ctx.dominfo.domid != dom )
+    {
+        ERROR("Domain %"PRIu32" does not exist", dom);
+        return -1;
+    }
+
+    ctx.domid = dom;
+    IPRINTF("Restoring domain %"PRIu32, dom);
+
+    if ( read_headers(&ctx) )
+        return -1;
+
+    if ( ctx.dominfo.hvm )
+    {
+        ctx.restore.ops = restore_ops_x86_hvm;
+        if ( restore(&ctx) )
+            return -1;
+    }
+    else
+    {
+        ctx.restore.ops = restore_ops_x86_pv;
+        if ( restore(&ctx) )
+            return -1;
+    }
+
+    DPRINTF("XenStore: mfn %#lx, dom %d, evt %u",
+            ctx.restore.xenstore_mfn,
+            ctx.restore.xenstore_domid,
+            ctx.restore.xenstore_evtchn);
+
+    DPRINTF("Console: mfn %#lx, dom %d, evt %u",
+            ctx.restore.console_mfn,
+            ctx.restore.console_domid,
+            ctx.restore.console_evtchn);
+
+    *console_mfn = ctx.restore.console_mfn;
+    *store_mfn = ctx.restore.xenstore_mfn;
+
+    return 0;
 }
 
 /*
-- 
1.7.10.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.