[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 6/6] tools/libxc: x86 pv restore implementation



Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
Signed-off-by: Frediano Ziglio <frediano.ziglio@xxxxxxxxxx>
---
 tools/libxc/saverestore/common.c         |   51 ++
 tools/libxc/saverestore/common.h         |   35 ++
 tools/libxc/saverestore/restore.c        |  112 +++-
 tools/libxc/saverestore/restore_x86_pv.c |  977 ++++++++++++++++++++++++++++++
 4 files changed, 1174 insertions(+), 1 deletion(-)
 create mode 100644 tools/libxc/saverestore/restore_x86_pv.c

diff --git a/tools/libxc/saverestore/common.c b/tools/libxc/saverestore/common.c
index df18447..dbfae21 100644
--- a/tools/libxc/saverestore/common.c
+++ b/tools/libxc/saverestore/common.c
@@ -84,6 +84,57 @@ int write_split_record(struct context *ctx, struct record 
*rec,
     return 0;
 }
 
+/*
+ * Reads the next record from the stream into *rec.
+ *
+ * The record header is read first and its length checked against
+ * REC_LENGTH_MAX.  The payload is then read, including any padding up to the
+ * next multiple of 8 bytes, into a buffer obtained from malloc().
+ *
+ * Returns 0 on success, with rec->type, rec->length and rec->data filled in.
+ * rec->data is NULL for a zero-length record; otherwise the caller owns the
+ * buffer and must free() it.  Returns -1 on failure.
+ */
+int read_record(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct rhdr rhdr;
+    size_t datasz;
+
+    if ( read_exact(ctx->fd, &rhdr, sizeof rhdr) )
+    {
+        PERROR("Failed to read Record Header from stream");
+        return -1;
+    }
+    else if ( rhdr.length > REC_LENGTH_MAX )
+    {
+        ERROR("Record (0x%08"PRIx32", %s) length 0x%"PRIx32
+              " exceeds max (0x%"PRIx32")",
+              rhdr.type, rec_type_to_str(rhdr.type),
+              rhdr.length, REC_LENGTH_MAX);
+        return -1;
+    }
+
+    /* Round the payload length up to a multiple of 8 to cover the padding. */
+    datasz = (rhdr.length + 7) & ~7U;
+
+    if ( datasz )
+    {
+        rec->data = malloc(datasz);
+
+        if ( !rec->data )
+        {
+            ERROR("Unable to allocate %zu bytes for record data "
+                  "(0x%08"PRIx32", %s)",
+                  datasz, rhdr.type, rec_type_to_str(rhdr.type));
+            return -1;
+        }
+
+        if ( read_exact(ctx->fd, rec->data, datasz) )
+        {
+            /* Do not leave a dangling pointer behind in the record. */
+            free(rec->data);
+            rec->data = NULL;
+            PERROR("Failed to read %zu bytes of data for record "
+                   "(0x%08"PRIx32", %s)",
+                   datasz, rhdr.type, rec_type_to_str(rhdr.type));
+            return -1;
+        }
+    }
+    else
+        rec->data = NULL;
+
+    rec->type   = rhdr.type;
+    rec->length = rhdr.length;
+
+    return 0;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/tools/libxc/saverestore/common.h b/tools/libxc/saverestore/common.h
index a2c8cee..249e18f 100644
--- a/tools/libxc/saverestore/common.h
+++ b/tools/libxc/saverestore/common.h
@@ -7,9 +7,12 @@
 
 #include "../xg_private.h"
 #include "../xg_save_restore.h"
+#include "../xc_dom.h"
 
 #undef GET_FIELD
 #undef SET_FIELD
+#undef MEMCPY_FIELD
+#undef MEMSET_ARRAY_FIELD
 #undef mfn_to_pfn
 #undef pfn_to_mfn
 
@@ -95,6 +98,8 @@ struct context
 
 /* Saves an x86 PV domain. */
 int save_x86_pv(struct context *ctx);
+/* Restores an x86 PV domain. */
+int restore_x86_pv(struct context *ctx);
 
 struct record
 {
@@ -118,6 +123,22 @@ struct record
             (_p)->x32._f = (_v);                \
     })
 
+/*
+ * memcpy field _f from _s to _d, of an *_any union.
+ *
+ * _c is the context: when (_c)->x86_pv.width is 8 the x64 member of both
+ * unions is used, otherwise the x32 member.  The number of bytes copied is
+ * the size of the destination field.
+ */
+#define MEMCPY_FIELD(_c, _d, _s, _f)                                    \
+    ({ if ( (_c)->x86_pv.width == 8 )                                   \
+            memcpy(&(_d)->x64._f, &(_s)->x64._f, sizeof((_d)->x64._f)); \
+        else                                                            \
+            memcpy(&(_d)->x32._f, &(_s)->x32._f, sizeof((_d)->x32._f)); \
+    })
+
+/*
+ * memset array field _f with value _v, from an *_any union.
+ *
+ * _c is the context: when (_c)->x86_pv.width is 8 the x64 member of the union
+ * is used, otherwise the x32 member.  The whole array is filled, starting at
+ * element 0 and covering sizeof the array.
+ */
+#define MEMSET_ARRAY_FIELD(_c, _d, _f, _v)                              \
+    ({ if ( (_c)->x86_pv.width == 8 )                                   \
+           memset(&(_d)->x64._f[0], (_v), sizeof((_d)->x64._f));        \
+       else                                                             \
+           memset(&(_d)->x32._f[0], (_v), sizeof((_d)->x32._f));        \
+    })
+
 /*
  * Writes a split record to the stream, applying correct padding where
  * appropriate.  It is common when sending records containing blobs from Xen
@@ -143,6 +164,20 @@ static inline int write_record(struct context *ctx, struct 
record *rec)
     return write_split_record(ctx, rec, NULL, 0);
 }
 
+/*
+ * Reads a record from the stream, and fills in the record structure.
+ *
+ * Returns 0 on success and non-0 on failure.
+ *
+ * On success, the record's type and length shall be valid.
+ * - If length is 0, data shall be NULL.
+ * - If length is non-0, data shall be a buffer allocated by malloc() which
+ *   must be passed to free() by the caller.
+ *
+ * On failure, the contents of the record structure are undefined.
+ */
+int read_record(struct context *ctx, struct record *rec);
+
 #endif
 /*
  * Local variables:
diff --git a/tools/libxc/saverestore/restore.c 
b/tools/libxc/saverestore/restore.c
index 6624baa..6937aec 100644
--- a/tools/libxc/saverestore/restore.c
+++ b/tools/libxc/saverestore/restore.c
@@ -1,5 +1,62 @@
+#include <arpa/inet.h>
+
 #include "common.h"
 
+/*
+ * Reads and validates the Image Header and the Domain Header found at the
+ * start of the stream.
+ *
+ * On success the stream's format version, guest type and guest page size are
+ * stored in ctx->restore and 0 is returned.  Returns -1 if either header
+ * cannot be read or fails validation.
+ */
+static int read_headers(struct context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    struct ihdr ihdr;
+    struct dhdr dhdr;
+
+    if ( read_exact(ctx->fd, &ihdr, sizeof ihdr) )
+    {
+        PERROR("Failed to read Image Header from stream");
+        return -1;
+    }
+
+    /* These three fields are converted from network byte order. */
+    ihdr.id      = ntohl(ihdr.id);
+    ihdr.version = ntohl(ihdr.version);
+    ihdr.options = ntohs(ihdr.options);
+
+    if ( ihdr.marker != IHDR_MARKER )
+    {
+        ERROR("Invalid marker: Got 0x%016"PRIx64, ihdr.marker);
+        return -1;
+    }
+    else if ( ihdr.id != IHDR_ID )
+    {
+        ERROR("Invalid ID: Expected 0x%08"PRIx32", Got 0x%08"PRIx32,
+              IHDR_ID, ihdr.id);
+        return -1;
+    }
+    else if ( ihdr.version != IHDR_VERSION )
+    {
+        /* Expected value first, then what the stream contained. */
+        ERROR("Invalid Version: Expected %d, Got %d",
+              IHDR_VERSION, ihdr.version);
+        return -1;
+    }
+    else if ( ihdr.options & IHDR_OPT_BIG_ENDIAN )
+    {
+        ERROR("Unable to handle big endian streams");
+        return -1;
+    }
+
+    ctx->restore.format_version = ihdr.version;
+
+    if ( read_exact(ctx->fd, &dhdr, sizeof dhdr) )
+    {
+        PERROR("Failed to read Domain Header from stream");
+        return -1;
+    }
+
+    /*
+     * page_shift comes straight from the stream.  Shifting a 32-bit value by
+     * 32 or more is undefined behaviour, so reject such values up front.
+     */
+    if ( dhdr.page_shift >= 32 )
+    {
+        ERROR("Invalid page shift %u in Domain Header",
+              (unsigned int)dhdr.page_shift);
+        return -1;
+    }
+
+    ctx->restore.guest_type = dhdr.type;
+    ctx->restore.guest_page_size = (1U << dhdr.page_shift);
+
+    IPRINTF("Found %s domain from Xen %d.%d",
+            dhdr_type_to_str(dhdr.type), dhdr.xen_major, dhdr.xen_minor);
+    return 0;
+}
+
+
 int xc_domain_restore2(xc_interface *xch, int io_fd, uint32_t dom,
                        unsigned int store_evtchn, unsigned long *store_mfn,
                        domid_t store_domid, unsigned int console_evtchn,
@@ -8,8 +65,61 @@ int xc_domain_restore2(xc_interface *xch, int io_fd, uint32_t 
dom,
                        int checkpointed_stream,
                        struct restore_callbacks *callbacks)
 {
+    struct context ctx =
+        {
+            .xch = xch,
+            .fd = io_fd,
+        };
+
+    ctx.restore.console_evtchn = console_evtchn;
+    ctx.restore.console_domid = console_domid;
+    ctx.restore.xenstore_evtchn = store_evtchn;
+    ctx.restore.xenstore_domid = store_domid;
+
     IPRINTF("In experimental %s", __func__);
-    return -1;
+
+    if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 )
+    {
+        PERROR("Failed to get domain info");
+        return -1;
+    }
+
+    if ( ctx.dominfo.domid != dom )
+    {
+        ERROR("Domain %d does not exist", dom);
+        return -1;
+    }
+
+    ctx.domid = dom;
+    IPRINTF("Restoring domain %d", dom);
+
+    if ( read_headers(&ctx) )
+        return -1;
+
+    if ( ctx.dominfo.hvm )
+    {
+        ERROR("HVM Restore not supported yet");
+        return -1;
+    }
+    else
+    {
+        if ( restore_x86_pv(&ctx) )
+            return -1;
+
+        DPRINTF("XenStore: mfn %#lx, dom %d, evt %u",
+                ctx.restore.xenstore_mfn,
+                ctx.restore.xenstore_domid,
+                ctx.restore.xenstore_evtchn);
+
+        DPRINTF("Console: mfn %#lx, dom %d, evt %u",
+                ctx.restore.console_mfn,
+                ctx.restore.console_domid,
+                ctx.restore.console_evtchn);
+
+        *console_mfn = ctx.restore.console_mfn;
+        *store_mfn = ctx.restore.xenstore_mfn;
+        return 0;
+    }
 }
 
 /*
diff --git a/tools/libxc/saverestore/restore_x86_pv.c 
b/tools/libxc/saverestore/restore_x86_pv.c
new file mode 100644
index 0000000..0659244
--- /dev/null
+++ b/tools/libxc/saverestore/restore_x86_pv.c
@@ -0,0 +1,977 @@
+#include <assert.h>
+#include <arpa/inet.h>
+
+#include "common_x86_pv.h"
+
+/*
+ * Grows the p2m, pfn_types and p2m frame list arrays so that they cover pfns
+ * up to and including max_pfn, and marks every newly added entry as
+ * invalid (p2m, frame list) or untyped (pfn_types).
+ *
+ * Returns 0 on success, or -1 if any (re)allocation fails.
+ */
+static int expand_p2m(struct context *ctx, unsigned long max_pfn)
+{
+    xc_interface *xch = ctx->xch;
+    unsigned long old_max = ctx->x86_pv.max_pfn, i;
+    unsigned long end_frame = (max_pfn + ctx->x86_pv.fpp) / ctx->x86_pv.fpp;
+    unsigned long old_end_frame =
+        (old_max + ctx->x86_pv.fpp) / ctx->x86_pv.fpp;
+    unsigned long first_pfn, first_frame;
+    xen_pfn_t *p2m = NULL, *p2m_pfns = NULL;
+    uint32_t *pfn_types = NULL;
+    size_t p2msz, p2m_pfnsz, pfn_typesz;
+
+    /* We expect expand_p2m to be called exactly once, expanding from 0 the
+     * domains max, but assert some sanity */
+    assert(max_pfn > old_max);
+
+    /*
+     * Work out where initialisation has to start, before the old pointers are
+     * replaced.  An array which did not exist yet has no valid entries, so it
+     * must be initialised from index 0.  old_end_frame cannot be used for
+     * this decision: it is at least 1 even when old_max is 0, which would
+     * leave the first entries of a fresh frame list uninitialised.
+     */
+    first_pfn = (ctx->x86_pv.p2m && ctx->x86_pv.pfn_types) ? old_max + 1 : 0;
+    first_frame = ctx->x86_pv.p2m_pfns ? old_end_frame + 1 : 0;
+
+    p2msz = (max_pfn + 1) * ctx->x86_pv.width;
+    p2m = realloc(ctx->x86_pv.p2m, p2msz);
+    if ( !p2m )
+    {
+        ERROR("Failed to (re)alloc %zu bytes for p2m", p2msz);
+        return -1;
+    }
+    ctx->x86_pv.p2m = p2m;
+
+    pfn_typesz = (max_pfn + 1) * sizeof *pfn_types;
+    pfn_types = realloc(ctx->x86_pv.pfn_types, pfn_typesz);
+    if ( !pfn_types )
+    {
+        ERROR("Failed to (re)alloc %zu bytes for pfn_types", pfn_typesz);
+        return -1;
+    }
+    ctx->x86_pv.pfn_types = pfn_types;
+
+    p2m_pfnsz = (end_frame + 1) * sizeof *p2m_pfns;
+    p2m_pfns = realloc(ctx->x86_pv.p2m_pfns, p2m_pfnsz);
+    if ( !p2m_pfns )
+    {
+        ERROR("Failed to (re)alloc %zu bytes for p2m frame list", p2m_pfnsz);
+        return -1;
+    }
+    ctx->x86_pv.p2m_frames = end_frame;
+    ctx->x86_pv.p2m_pfns = p2m_pfns;
+
+    /* max_pfn is updated before the new entries are written via set_p2m(). */
+    ctx->x86_pv.max_pfn = max_pfn;
+    for ( i = first_pfn; i <= max_pfn; ++i )
+    {
+        set_p2m(ctx, i, INVALID_MFN);
+        ctx->x86_pv.pfn_types[i] = 0;
+    }
+
+    for ( i = first_frame; i <= end_frame; ++i )
+        ctx->x86_pv.p2m_pfns[i] = INVALID_MFN;
+
+    DPRINTF("Expanded p2m from %#lx to %#lx", old_max, max_pfn);
+    return 0;
+}
+
+/*
+ * Walks every pfn up to and including max_pfn and issues a pin operation for
+ * each one whose recorded type has the LPINTAB flag set and names an L1 to L4
+ * pagetable.
+ *
+ * Returns 0 on success, or -1 as soon as one pin operation fails.
+ */
+static int pin_pagetables(struct context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    struct mmuext_op op;
+    unsigned long pfn;
+
+    DPRINTF("Pinning pagetables");
+
+    for ( pfn = 0; pfn <= ctx->x86_pv.max_pfn; ++pfn )
+    {
+        unsigned long type = ctx->x86_pv.pfn_types[pfn];
+        unsigned long level = type & XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+        /* Only frames flagged as pinned are of interest. */
+        if ( !(type & XEN_DOMCTL_PFINFO_LPINTAB) )
+            continue;
+
+        /* Pick the pin command matching the pagetable level. */
+        if ( level == XEN_DOMCTL_PFINFO_L1TAB )
+            op.cmd = MMUEXT_PIN_L1_TABLE;
+        else if ( level == XEN_DOMCTL_PFINFO_L2TAB )
+            op.cmd = MMUEXT_PIN_L2_TABLE;
+        else if ( level == XEN_DOMCTL_PFINFO_L3TAB )
+            op.cmd = MMUEXT_PIN_L3_TABLE;
+        else if ( level == XEN_DOMCTL_PFINFO_L4TAB )
+            op.cmd = MMUEXT_PIN_L4_TABLE;
+        else
+            continue;
+
+        op.arg1.mfn = pfn_to_mfn(ctx, pfn);
+
+        if ( xc_mmuext_op(xch, &op, 1, ctx->domid) != 0 )
+        {
+            PERROR("Failed to pin page table for pfn %#lx", pfn);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Converts the start info reference held in the vcpu's edx from a pfn to an
+ * mfn, and fixes up the contents of the guest's start info page:
+ *  - the xenstore and console frames are translated from pfn to mfn and
+ *    recorded in ctx->restore,
+ *  - the xenstore and console event channels are set from ctx->restore,
+ *  - nr_pages, shared_info and flags are rewritten.
+ *
+ * vcpu is only modified on success.  Returns 0 on success, -1 on failure.
+ */
+static int process_start_info(struct context *ctx,
+                              vcpu_guest_context_any_t *vcpu)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t pfn, mfn, start_info_mfn;
+    start_info_any_t *guest_start_info = NULL;
+    int rc = -1;
+
+    pfn = GET_FIELD(ctx, vcpu, user_regs.edx);
+
+    if ( pfn > ctx->x86_pv.max_pfn )
+    {
+        ERROR("Start Info pfn %#lx out of range", pfn);
+        goto err;
+    }
+    else if ( ctx->x86_pv.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB )
+    {
+        ERROR("Start Info pfn %#lx has bad type %lu", pfn,
+              (unsigned long)(ctx->x86_pv.pfn_types[pfn] >>
+                              XEN_DOMCTL_PFINFO_LTAB_SHIFT));
+        goto err;
+    }
+
+    mfn = pfn_to_mfn(ctx, pfn);
+    if ( !mfn_in_pseudophysmap(ctx, mfn) )
+    {
+        ERROR("Start Info has bad MFN");
+        pseudophysmap_walk(ctx, mfn);
+        goto err;
+    }
+
+    /*
+     * Remember the start info mfn: 'mfn' is reused for the xenstore and
+     * console frames below, and edx must end up referring to start info.
+     */
+    start_info_mfn = mfn;
+
+    guest_start_info = xc_map_foreign_range(
+        xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
+    if ( !guest_start_info )
+    {
+        PERROR("Failed to map Start Info at mfn %#lx", mfn);
+        goto err;
+    }
+
+    /* Deal with xenstore stuff */
+    pfn = GET_FIELD(ctx, guest_start_info, store_mfn);
+    if ( pfn > ctx->x86_pv.max_pfn )
+    {
+        ERROR("XenStore pfn %#lx out of range", pfn);
+        goto err;
+    }
+
+    mfn = pfn_to_mfn(ctx, pfn);
+    if ( !mfn_in_pseudophysmap(ctx, mfn) )
+    {
+        ERROR("XenStore pfn has bad MFN");
+        pseudophysmap_walk(ctx, mfn);
+        goto err;
+    }
+
+    ctx->restore.xenstore_mfn = mfn;
+    SET_FIELD(ctx, guest_start_info, store_mfn, mfn);
+    SET_FIELD(ctx, guest_start_info, store_evtchn,
+              ctx->restore.xenstore_evtchn);
+
+    /* Deal with console stuff */
+    pfn = GET_FIELD(ctx, guest_start_info, console.domU.mfn);
+    if ( pfn > ctx->x86_pv.max_pfn )
+    {
+        ERROR("Console pfn %#lx out of range", pfn);
+        goto err;
+    }
+
+    mfn = pfn_to_mfn(ctx, pfn);
+    if ( !mfn_in_pseudophysmap(ctx, mfn) )
+    {
+        ERROR("Console pfn has bad MFN");
+        pseudophysmap_walk(ctx, mfn);
+        goto err;
+    }
+
+    ctx->restore.console_mfn = mfn;
+    SET_FIELD(ctx, guest_start_info, console.domU.mfn, mfn);
+    SET_FIELD(ctx, guest_start_info, console.domU.evtchn,
+              ctx->restore.console_evtchn);
+
+    /* Set other information */
+    SET_FIELD(ctx, guest_start_info, nr_pages, ctx->x86_pv.max_pfn + 1);
+    SET_FIELD(ctx, guest_start_info, shared_info,
+              ctx->dominfo.shared_info_frame << PAGE_SHIFT);
+    SET_FIELD(ctx, guest_start_info, flags, 0);
+
+    /* Point edx at the start info frame, not at the console frame. */
+    SET_FIELD(ctx, vcpu, user_regs.edx, start_info_mfn);
+    rc = 0;
+
+err:
+    if ( guest_start_info )
+        munmap(guest_start_info, PAGE_SIZE);
+
+    return rc;
+}
+
+/*
+ * Translates every entry of the p2m frame list from pfn to mfn (in place, in
+ * ctx->x86_pv.p2m_pfns), maps those frames from the guest, and copies the
+ * locally built p2m into them.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int update_guest_p2m(struct context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t mfn, pfn, *guest_p2m = NULL;
+    unsigned i;
+    int rc = -1;
+
+    for ( i = 0; i < ctx->x86_pv.p2m_frames; ++i )
+    {
+        pfn = ctx->x86_pv.p2m_pfns[i];
+
+        if ( pfn > ctx->x86_pv.max_pfn )
+        {
+            ERROR("pfn (%#lx) for p2m_frame_list[%u] out of range",
+                  pfn, i);
+            goto err;
+        }
+        else if ( ctx->x86_pv.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB )
+        {
+            /* Cast so the argument matches %lu whatever the element type. */
+            ERROR("pfn (%#lx) for p2m_frame_list[%u] has bad type %lu", pfn, i,
+                  (unsigned long)(ctx->x86_pv.pfn_types[pfn] >>
+                                  XEN_DOMCTL_PFINFO_LTAB_SHIFT));
+            goto err;
+        }
+
+        mfn = pfn_to_mfn(ctx, pfn);
+        if ( !mfn_in_pseudophysmap(ctx, mfn) )
+        {
+            ERROR("p2m_frame_list[%u] has bad MFN", i);
+            pseudophysmap_walk(ctx, mfn);
+            goto err;
+        }
+
+        ctx->x86_pv.p2m_pfns[i] = mfn;
+    }
+
+    guest_p2m = xc_map_foreign_pages(xch, ctx->domid, PROT_WRITE,
+                                     ctx->x86_pv.p2m_pfns,
+                                     ctx->x86_pv.p2m_frames );
+    if ( !guest_p2m )
+    {
+        PERROR("Failed to map p2m frames");
+        goto err;
+    }
+
+    memcpy(guest_p2m, ctx->x86_pv.p2m,
+           (ctx->x86_pv.max_pfn + 1) * ctx->x86_pv.width);
+    rc = 0;
+ err:
+    if ( guest_p2m )
+        munmap(guest_p2m, ctx->x86_pv.p2m_frames * PAGE_SIZE);
+
+    return rc;
+}
+
+/*
+ * Makes sure the given pfn has memory behind it.
+ *
+ * If the p2m already holds an mfn for pfn nothing is done.  Otherwise one
+ * frame is populated in the domain's physmap and the resulting mfn is stored
+ * in the p2m.
+ *
+ * Returns 0 on success, or the non-zero result of the populate call.
+ */
+static int populate_pfn(struct context *ctx, xen_pfn_t pfn)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t mfn;
+    int rc;
+
+    /* Already populated: nothing to do. */
+    if ( pfn_to_mfn(ctx, pfn) != INVALID_MFN )
+        return 0;
+
+    /* The call takes the pfn as input and hands the mfn back in place. */
+    mfn = pfn;
+    rc = xc_domain_populate_physmap_exact(xch, ctx->domid, 1, 0, 0, &mfn);
+    if ( rc != 0 )
+    {
+        ERROR("Failed to populate physmap");
+        return rc;
+    }
+
+    set_p2m(ctx, pfn, mfn);
+
+    /* This *really* should be true by now, or something has gone very wrong */
+    assert(mfn_in_pseudophysmap(ctx, mfn));
+
+    return 0;
+}
+
+/*
+ * Rewrites one page of pagetable entries so that every present entry refers
+ * to an mfn instead of a pfn.  Frames which have no mfn yet are populated on
+ * demand.
+ *
+ * table points at PAGE_SIZE bytes of 64-bit entries; type is the page type
+ * and is only used to name the pagetable level in error messages.
+ *
+ * Returns 0 on success.  Returns -1 on failure; errno is set to ERANGE when
+ * an entry refers to a frame outside the domain.
+ */
+static int localise_pagetable(struct context *ctx, uint64_t *table,
+                              xen_pfn_t type)
+{
+    xc_interface *xch = ctx->xch;
+    uint64_t pte;
+    unsigned i;
+
+    type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+    for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
+    {
+        pte = table[i];
+
+        if ( pte & _PAGE_PRESENT )
+        {
+            xen_pfn_t mfn, pfn;
+
+            pfn = pte_to_frame(ctx, pte);
+
+            /*
+             * The entry comes from the stream, so check the pfn is inside
+             * the domain before it is used to look anything up.
+             */
+            if ( pfn > ctx->x86_pv.max_pfn )
+            {
+                ERROR("L%lu[%u] pfn %#lx outside domain maximum (%#lx)",
+                      type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i,
+                      pfn, ctx->x86_pv.max_pfn);
+                errno = ERANGE;
+                return -1;
+            }
+
+            mfn = pfn_to_mfn(ctx, pfn);
+
+            if ( mfn == INVALID_MFN )
+            {
+                if ( populate_pfn(ctx, pfn) )
+                    return -1;
+
+                mfn = pfn_to_mfn(ctx, pfn);
+            }
+
+            if ( !mfn_in_pseudophysmap(ctx, mfn) )
+            {
+                ERROR("Bad MFN for L%lu[%u]",
+                      type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i);
+                pseudophysmap_walk(ctx, mfn);
+                errno = ERANGE;
+                return -1;
+            }
+
+            update_pte(ctx, &pte, mfn);
+
+            table[i] = pte;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Handles an END record.  The record itself is not examined; the event is
+ * only logged.  Always returns 0.
+ */
+static int handle_end(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    const int rc = 0;
+
+    DPRINTF("End record");
+
+    return rc;
+}
+
+/*
+ * Handles a PAGE_DATA record describing exactly one page.
+ *
+ * The pfn and type are extracted from the record's single pfn entry and the
+ * type is remembered in pfn_types.  XTAB and BROKEN pages carry no data and
+ * are left alone.  Every other type gets a frame populated and mapped; page
+ * contents are copied in unless the type is XALLOC, and pagetable pages
+ * (L1 to L4) are then converted from pfns to mfns.
+ *
+ * Returns 0 on success and non-0 on failure.
+ */
+static int handle_page_data(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct rec_page_data_header *page = rec->data;
+    xen_pfn_t mfn, pfn, type, tabtype;
+    void *guest_page = NULL;
+    /* Bytes needed for the fixed header plus the one pfn entry read below. */
+    size_t hdrsz = sizeof *page + sizeof page->pfn[0];
+    int rc = -1, err;
+
+    /* pfn[0] is read below, so the record must be long enough to hold it. */
+    if ( rec->length < hdrsz )
+    {
+        ERROR("PAGE_DATA record trucated: length %"PRIu32", min %zu",
+              rec->length, hdrsz);
+        goto cleanup;
+    }
+    else if ( page->count != 1 )
+    {
+        // TODO
+        ERROR("Unable to handle batched pages (yet)");
+        goto cleanup;
+    }
+
+    pfn = page->pfn[0] & PAGE_DATA_PFN_MASK;
+    if ( pfn > ctx->x86_pv.max_pfn )
+    {
+        ERROR("pfn %#lx outside domain maximum (%#lx)",
+              pfn, ctx->x86_pv.max_pfn);
+        goto cleanup;
+    }
+
+    type = (page->pfn[0] & PAGE_DATA_TYPE_MASK) >> 32;
+    if ( ((type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT) >= 5) &&
+         ((type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT) <= 8) )
+    {
+        ERROR("Invalid type %#lx for pfn %#lx", type, pfn);
+        goto cleanup;
+    }
+
+    ctx->x86_pv.pfn_types[pfn] = type;
+
+    switch ( type )
+    {
+        case XEN_DOMCTL_PFINFO_XTAB:
+        case XEN_DOMCTL_PFINFO_BROKEN:
+            /* No page data - leave alone */
+            rc = 0;
+            goto cleanup;
+    }
+
+    /*
+     * Everything except XALLOC has a full page of data after the pfn entry.
+     * Check it is really there before anything is allocated or copied.
+     */
+    if ( type != XEN_DOMCTL_PFINFO_XALLOC &&
+         rec->length < hdrsz + PAGE_SIZE )
+    {
+        ERROR("PAGE_DATA record for pfn %#lx has no page data: length %"PRIu32
+              ", min %zu", pfn, rec->length, (size_t)(hdrsz + PAGE_SIZE));
+        rc = -1;
+        goto cleanup;
+    }
+
+    /* All other page types, need to allocate */
+    rc = populate_pfn(ctx, pfn);
+    if ( rc )
+        goto cleanup;
+
+    mfn = pfn_to_mfn(ctx, pfn);
+
+    guest_page = xc_map_foreign_bulk(
+        xch, ctx->domid, PROT_READ | PROT_WRITE, &mfn, &err, 1);
+    if ( !guest_page || err )
+    {
+        PERROR("Unable to map mfn %#lx (err %d)", mfn, err);
+        rc = -1;
+        goto cleanup;
+    }
+
+    /* XALLOC also has no page data */
+    if ( type != XEN_DOMCTL_PFINFO_XALLOC )
+        memcpy(guest_page, &page->pfn[1], PAGE_SIZE);
+
+    /* Pagetables need to be localised */
+    tabtype = type & XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+    if ( tabtype >= XEN_DOMCTL_PFINFO_L1TAB &&
+         tabtype <= XEN_DOMCTL_PFINFO_L4TAB )
+    {
+        rc = localise_pagetable(ctx, guest_page, type);
+        if ( rc )
+            goto cleanup;
+    }
+
+    rc = 0;
+
+ cleanup:
+    if ( guest_page )
+        munmap(guest_page, PAGE_SIZE);
+
+    return rc;
+}
+
+/*
+ * Handles an X86_PV_INFO record, which carries the guest's width (in bytes)
+ * and number of pagetable levels.
+ *
+ * Both stream values are validated first.  If the width differs from that of
+ * the existing domain, an attempt is made to switch the domain's address size
+ * and the cached domain information is refreshed.  The number of levels is
+ * then compared against the domain in every case.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int handle_x86_pv_info(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct rec_x86_pv_info *info = rec->data;
+
+    if ( rec->length < sizeof *info )
+    {
+        ERROR("X86_PV_INFO record trucated: length %"PRIu32", expected %zu",
+              rec->length, sizeof *info);
+        return -1;
+    }
+    else if ( info->guest_width != 4 &&
+              info->guest_width != 8 )
+    {
+        ERROR("Unexpected guest width %"PRIu32", Expected 4 or 8",
+              info->guest_width);
+        return -1;
+    }
+    else if ( info->pt_levels != 3 &&
+              info->pt_levels != 4 )
+    {
+        ERROR("Unexpected guest levels %"PRIu32", Expected 3 or 4",
+              info->pt_levels);
+        return -1;
+    }
+
+    if ( info->guest_width != ctx->x86_pv.width )
+    {
+        int rc;
+        struct xen_domctl domctl;
+
+        /* try to set address size, domain is always created 64 bit */
+        memset(&domctl, 0, sizeof(domctl));
+        domctl.domain = ctx->domid;
+        domctl.cmd    = XEN_DOMCTL_set_address_size;
+        domctl.u.address_size.size = info->guest_width * 8;
+        rc = do_domctl(xch, &domctl);
+        if ( rc != 0 )
+        {
+            ERROR("Width of guest in stream (%"PRIu32
+                  " bits) differs with existing domain (%"PRIu32" bits)",
+                  info->guest_width * 8, ctx->x86_pv.width * 8);
+            return -1;
+        }
+
+        /* domain informations changed, better to refresh */
+        rc = x86_pv_domain_info(ctx);
+        if ( rc != 0 )
+        {
+            ERROR("Unable to refresh guest informations");
+            return -1;
+        }
+    }
+
+    /*
+     * Deliberately not chained to the width handling above: the levels must
+     * be checked whether or not the address size had to be changed.
+     */
+    if ( info->pt_levels != ctx->x86_pv.levels )
+    {
+        ERROR("Levels of guest in stream (%"PRIu32
+              ") differs with existing domain (%"PRIu32")",
+              info->pt_levels, ctx->x86_pv.levels);
+        return -1;
+    }
+
+    DPRINTF("X86_PV_INFO record: %d bits, %d levels",
+            ctx->x86_pv.width * 8, ctx->x86_pv.levels);
+    return 0;
+}
+
+/*
+ * Handles an X86_PV_P2M_FRAMES record: a start and end pfn followed by one
+ * 64-bit entry for each p2m frame covering that range.
+ *
+ * The record length is checked against the range, the local p2m structures
+ * are expanded if end_pfn lies beyond the current maximum, and the entries
+ * are stored in ctx->x86_pv.p2m_pfns.
+ *
+ * Returns 0 on success and non-0 on failure.
+ */
+static int handle_x86_pv_p2m_frames(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct rec_x86_pv_p2m_frames *data = rec->data;
+    unsigned start, end, x;
+    int rc;
+
+    /* A valid range always covers at least one frame, hence one entry. */
+    if ( rec->length < sizeof *data + sizeof(uint64_t) )
+    {
+        ERROR("X86_PV_P2M_FRAMES record trucated: length %"PRIu32", min %zu",
+              rec->length, sizeof *data + sizeof(uint64_t));
+        return -1;
+    }
+    else if ( data->start_pfn > data->end_pfn )
+    {
+        ERROR("Start pfn in stream (%#"PRIx32") exceeds End (%#"PRIx32")",
+              data->start_pfn, data->end_pfn);
+        return -1;
+    }
+
+    /* Frame indices: [start, end) is the half-open range being supplied. */
+    start =  data->start_pfn / ctx->x86_pv.fpp;
+    end = data->end_pfn / ctx->x86_pv.fpp + 1;
+
+    if ( rec->length != sizeof *data + ((end - start) * sizeof (uint64_t)) )
+    {
+        ERROR("X86_PV_P2M_FRAMES record wrong size: start_pfn %#"PRIx32
+              ", end_pfn %#"PRIx32", length %"PRIu32
+              ", expected %zu + (%u - %u) * %zu",
+              data->start_pfn, data->end_pfn, rec->length,
+              sizeof *data, end, start, sizeof(uint64_t));
+        return -1;
+    }
+
+    if ( data->end_pfn > ctx->x86_pv.max_pfn )
+    {
+        rc = expand_p2m(ctx, data->end_pfn);
+        if ( rc )
+            return rc;
+    }
+
+    /*
+     * The record holds exactly (end - start) entries, as enforced by the
+     * length check above, so the bound has to be exclusive.
+     */
+    for ( x = 0; x < (end - start); ++x )
+        ctx->x86_pv.p2m_pfns[start + x] = data->p2m_pfns[x];
+
+    DPRINTF("X86_PV_P2M_FRAMES record: GFNs %#"PRIx32"->%#"PRIx32,
+            data->start_pfn, data->end_pfn);
+    return 0;
+}
+
+/*
+ * Handles an X86_PV_VCPU_BASIC record: a vcpu id followed by that vcpu's
+ * guest context in the layout matching the guest's width.
+ *
+ * The context is copied locally, the vcpu is marked online, and every frame
+ * reference in it is converted from pfn to mfn: start info (vcpu 0 only), the
+ * GDT frames, cr3 and, for 4-level guests with bit 0 of cr1 set, cr1.  The
+ * result is then handed to Xen.
+ *
+ * Returns 0 on success and non-0 on failure.
+ */
+static int handle_x86_pv_vcpu_basic(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct rec_x86_pv_vcpu *vhdr = rec->data;
+    vcpu_guest_context_any_t vcpu;
+    size_t vcpusz = ctx->x86_pv.width == 8 ? sizeof vcpu.x64 : sizeof vcpu.x32;
+    xen_pfn_t pfn, mfn;
+    unsigned long tmp;
+    unsigned i;
+    int rc = -1;
+
+    if ( rec->length <= sizeof *vhdr )
+    {
+        ERROR("X86_PV_VCPU_BASIC record trucated: length %"PRIu32", min %zu",
+              rec->length, sizeof *vhdr + 1);
+        goto err;
+    }
+    else if ( rec->length != sizeof *vhdr + vcpusz )
+    {
+        ERROR("X86_PV_VCPU_BASIC record wrong size: length %"PRIu32
+              ", expected %zu", rec->length, sizeof *vhdr + vcpusz);
+        goto err;
+    }
+    else if ( vhdr->vcpu_id > ctx->dominfo.max_vcpu_id )
+    {
+        ERROR("X86_PV_VCPU_BASIC record vcpu_id (%"PRIu32
+              ") exceeds domain max (%u)",
+              vhdr->vcpu_id, ctx->dominfo.max_vcpu_id);
+        goto err;
+    }
+
+    memcpy(&vcpu, &vhdr->context, vcpusz);
+
+    SET_FIELD(ctx, &vcpu, flags, GET_FIELD(ctx, &vcpu, flags) | VGCF_online);
+
+    /* Vcpu 0 is special: Convert the suspend record to an MFN */
+    if ( vhdr->vcpu_id == 0 )
+    {
+        rc = process_start_info(ctx, &vcpu);
+        if ( rc )
+            return rc;
+        rc = -1;
+    }
+
+    tmp = GET_FIELD(ctx, &vcpu, gdt_ents);
+    if ( tmp > 8192 )
+    {
+        ERROR("GDT entry count (%lu) out of range", tmp);
+        errno = ERANGE;
+        goto err;
+    }
+
+    /* Convert GDT frames to MFNs */
+    for ( i = 0; (i * 512) < tmp; ++i )
+    {
+        pfn = GET_FIELD(ctx, &vcpu, gdt_frames[i]);
+        /* max_pfn is itself a valid pfn, so only reject values above it. */
+        if ( pfn > ctx->x86_pv.max_pfn )
+        {
+            ERROR("GDT frame %u (pfn %#lx) out of range", i, pfn);
+            goto err;
+        }
+        else if ( ctx->x86_pv.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB )
+        {
+            ERROR("GDT frame %u (pfn %#lx) has bad type %lu", i, pfn,
+                  (unsigned long)(ctx->x86_pv.pfn_types[pfn] >>
+                                  XEN_DOMCTL_PFINFO_LTAB_SHIFT));
+            goto err;
+        }
+
+        mfn = pfn_to_mfn(ctx, pfn);
+        if ( !mfn_in_pseudophysmap(ctx, mfn) )
+        {
+            ERROR("GDT frame %u has bad MFN", i);
+            pseudophysmap_walk(ctx, mfn);
+            goto err;
+        }
+
+        SET_FIELD(ctx, &vcpu, gdt_frames[i], mfn);
+    }
+
+    /* Convert CR3 to an MFN */
+    pfn = cr3_to_mfn(ctx, GET_FIELD(ctx, &vcpu, ctrlreg[3]));
+    if ( pfn > ctx->x86_pv.max_pfn )
+    {
+        ERROR("cr3 (pfn %#lx) out of range", pfn);
+        goto err;
+    }
+    else if ( (ctx->x86_pv.pfn_types[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
+              (((xen_pfn_t)ctx->x86_pv.levels) <<
+               XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
+    {
+        ERROR("cr3 (pfn %#lx) has bad type %lu, expected %lu", pfn,
+              (unsigned long)(ctx->x86_pv.pfn_types[pfn] >>
+                              XEN_DOMCTL_PFINFO_LTAB_SHIFT),
+              (unsigned long)ctx->x86_pv.levels);
+        goto err;
+    }
+
+    mfn = pfn_to_mfn(ctx, pfn);
+    if ( !mfn_in_pseudophysmap(ctx, mfn) )
+    {
+        ERROR("cr3 has bad MFN");
+        pseudophysmap_walk(ctx, mfn);
+        goto err;
+    }
+
+    SET_FIELD(ctx, &vcpu, ctrlreg[3], mfn_to_cr3(ctx, mfn));
+
+    /* 64bit guests: Convert CR1 (guest pagetables) to MFN */
+    if ( ctx->x86_pv.levels == 4 && (vcpu.x64.ctrlreg[1] & 1) )
+    {
+        pfn = vcpu.x64.ctrlreg[1] >> PAGE_SHIFT;
+
+        if ( pfn > ctx->x86_pv.max_pfn )
+        {
+            ERROR("cr1 (pfn %#lx) out of range", pfn);
+            goto err;
+        }
+        else if ( (ctx->x86_pv.pfn_types[pfn] &
+                   XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
+                  (((xen_pfn_t)ctx->x86_pv.levels) <<
+                   XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
+        {
+            ERROR("cr1 (pfn %#lx) has bad type %lu, expected %lu", pfn,
+                  (unsigned long)(ctx->x86_pv.pfn_types[pfn] >>
+                                  XEN_DOMCTL_PFINFO_LTAB_SHIFT),
+                  (unsigned long)ctx->x86_pv.levels);
+            goto err;
+        }
+
+        mfn = pfn_to_mfn(ctx, pfn);
+        if ( !mfn_in_pseudophysmap(ctx, mfn) )
+        {
+            ERROR("cr1 has bad MFN");
+            pseudophysmap_walk(ctx, mfn);
+            goto err;
+        }
+
+        vcpu.x64.ctrlreg[1] = (uint64_t)mfn << PAGE_SHIFT;
+    }
+
+    if ( xc_vcpu_setcontext(xch, ctx->domid, vhdr->vcpu_id, &vcpu) )
+    {
+        PERROR("Failed to set vcpu%"PRIu32"'s basic info", vhdr->vcpu_id);
+        goto err;
+    }
+
+    rc = 0;
+    DPRINTF("vcpu%d X86_PV_VCPU_BASIC record", vhdr->vcpu_id);
+ err:
+    return rc;
+}
+
+/*
+ * Handles an X86_PV_VCPU_EXTENDED record: a vcpu id followed by a blob of at
+ * most 128 bytes, which is passed to Xen as the vcpu's extended context.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int handle_x86_pv_vcpu_extended(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct rec_x86_pv_vcpu *vcpu = rec->data;
+    DECLARE_DOMCTL;
+
+    if ( rec->length <= sizeof *vcpu )
+    {
+        ERROR("X86_PV_VCPU_EXTENDED record trucated: length %"PRIu32", min %zu",
+              rec->length, sizeof *vcpu + 1);
+        return -1;
+    }
+    else if ( rec->length > sizeof *vcpu + 128 )
+    {
+        ERROR("X86_PV_VCPU_EXTENDED record too long: length %"PRIu32", max %zu",
+              rec->length, sizeof *vcpu + 128);
+        return -1;
+    }
+    else if ( vcpu->vcpu_id > ctx->dominfo.max_vcpu_id )
+    {
+        ERROR("X86_PV_VCPU_EXTENDED record vcpu_id (%"PRIu32
+              ") exceeds domain max (%u)",
+              vcpu->vcpu_id, ctx->dominfo.max_vcpu_id);
+        return -1;
+    }
+
+    domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
+    domctl.domain = ctx->domid;
+
+    /*
+     * The blob may be shorter than the structure, so clear the structure
+     * first rather than passing whatever happened to be on the stack.
+     */
+    memset(&domctl.u.ext_vcpucontext, 0, sizeof domctl.u.ext_vcpucontext);
+    memcpy(&domctl.u.ext_vcpucontext, &vcpu->context,
+           rec->length - sizeof *vcpu);
+
+    /* Target the vcpu id validated above, not the one inside the blob. */
+    domctl.u.ext_vcpucontext.vcpu = vcpu->vcpu_id;
+
+    if ( xc_domctl(xch, &domctl) != 0 )
+    {
+        PERROR("Failed to set vcpu%"PRIu32"'s extended info", vcpu->vcpu_id);
+        return -1;
+    }
+
+    DPRINTF("vcpu%d X86_PV_VCPU_EXTENDED record", vcpu->vcpu_id);
+    return 0;
+}
+
+/*
+ * Handles an X86_PV_VCPU_XSAVE record: a fixed header (vcpu id and feature
+ * mask) followed by the xsave data, which is copied into a hypercall buffer
+ * and passed to Xen as the vcpu's extended state.
+ *
+ * Returns 0 on success and non-0 on failure.
+ */
+static int handle_x86_pv_vcpu_xsave(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct rec_x86_pv_vcpu_xsave *vcpu = rec->data;
+    int rc;
+    DECLARE_DOMCTL;
+    DECLARE_HYPERCALL_BUFFER(void, buffer);
+    size_t buffersz;
+
+    if ( rec->length <= sizeof *vcpu )
+    {
+        ERROR("X86_PV_VCPU_XSAVE record trucated: length %"PRIu32", min %zu",
+              rec->length, sizeof *vcpu + 1);
+        return -1;
+    }
+    else if ( vcpu->vcpu_id > ctx->dominfo.max_vcpu_id )
+    {
+        ERROR("X86_PV_VCPU_XSAVE record vcpu_id (%"PRIu32
+              ") exceeds domain max (%u)",
+              vcpu->vcpu_id, ctx->dominfo.max_vcpu_id);
+        return -1;
+    }
+
+    /* Everything after the fixed header is xsave data. */
+    buffersz = rec->length - sizeof *vcpu;
+    buffer = xc_hypercall_buffer_alloc(xch, buffer, buffersz);
+    if ( !buffer )
+    {
+        ERROR("Unable to allocate %zu bytes for xsave hypercall buffer",
+              buffersz);
+        return -1;
+    }
+
+    /*
+     * Fill the buffer from the record.  Without this copy the hypercall
+     * would be handed the buffer's uninitialised contents.
+     */
+    memcpy(buffer, vcpu + 1, buffersz);
+
+    domctl.cmd = XEN_DOMCTL_setvcpuextstate;
+    domctl.domain = ctx->domid;
+    domctl.u.vcpuextstate.vcpu = vcpu->vcpu_id;
+    domctl.u.vcpuextstate.xfeature_mask = vcpu->xfeature_mask;
+    domctl.u.vcpuextstate.size = buffersz;
+    set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);
+
+    rc = xc_domctl(xch, &domctl);
+
+    xc_hypercall_buffer_free(xch, buffer);
+
+    if ( rc )
+    {
+        PERROR("Failed to set vcpu%"PRIu32"'s xsave info", vcpu->vcpu_id);
+        return rc;
+    }
+    else
+    {
+        DPRINTF("vcpu%d X86_PV_VCPU_XSAVE record", vcpu->vcpu_id);
+        return 0;
+    }
+}
+
+/*
+ * Handles an X86_PV_SHARED_INFO record, which must be exactly one page long.
+ *
+ * The domain's shared info frame is mapped, the vcpu_info and arch fields
+ * are copied in from the record, the p2m frame list list reference is
+ * zeroed, all pending event channel state is cleared and every event channel
+ * is masked.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int handle_x86_pv_shared_info(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    shared_info_any_t *stream = rec->data;
+    shared_info_any_t *guest;
+    unsigned vcpu;
+
+    if ( rec->length != PAGE_SIZE )
+    {
+        ERROR("X86_PV_SHARED_INFO record wrong size: length %"PRIu32
+              ", expected %u", rec->length, PAGE_SIZE);
+        return -1;
+    }
+
+    guest = xc_map_foreign_range(
+        xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE,
+        ctx->dominfo.shared_info_frame);
+    if ( guest == NULL )
+    {
+        PERROR("Failed to map Shared Info at mfn %#lx",
+               ctx->dominfo.shared_info_frame);
+        return -1;
+    }
+
+    /* Take these two fields from the stream. */
+    MEMCPY_FIELD(ctx, guest, stream, vcpu_info);
+    MEMCPY_FIELD(ctx, guest, stream, arch);
+
+    SET_FIELD(ctx, guest, arch.pfn_to_mfn_frame_list_list, 0);
+
+    /* Clear pending event channels, globally and per vcpu. */
+    MEMSET_ARRAY_FIELD(ctx, guest, evtchn_pending, 0);
+    for ( vcpu = 0; vcpu < XEN_LEGACY_MAX_VCPUS; vcpu++ )
+        SET_FIELD(ctx, guest, vcpu_info[vcpu].evtchn_pending_sel, 0);
+
+    /* Mask every event channel. */
+    MEMSET_ARRAY_FIELD(ctx, guest, evtchn_mask, 0xff);
+
+    munmap(guest, PAGE_SIZE);
+
+    return 0;
+}
+/*
+ * Process a TSC_INFO record.
+ *
+ * Checks that the record length matches struct rec_tsc_info, then passes
+ * the mode, nsec, khz and incarnation values from the record to
+ * xc_domain_set_tsc_info() for the domain being restored.
+ *
+ * Returns 0 on success, or -1 on a size mismatch or if the call fails.
+ */
+static int handle_tsc_info(struct context *ctx, struct record *rec)
+{
+    xc_interface *xch = ctx->xch;
+    struct rec_tsc_info *info = rec->data;
+    int ret;
+
+    if ( rec->length != sizeof *info )
+    {
+        ERROR("TSC_INFO record wrong size: length %"PRIu32", expected %zu",
+              rec->length, sizeof *info);
+        return -1;
+    }
+
+    ret = xc_domain_set_tsc_info(xch, ctx->domid, info->mode,
+                                 info->nsec, info->khz, info->incarnation);
+    if ( !ret )
+        return 0;
+
+    PERROR("Unable to set TSC information");
+    return -1;
+}
+
+/*
+ * Restore an x86 PV domain from the stream attached to ctx.
+ *
+ * The stream's guest type must be DHDR_TYPE_x86_pv and its page size 4096;
+ * otherwise -1 is returned immediately, without running the cleanup below.
+ *
+ * After x86_pv_domain_info() and x86_pv_map_m2p() succeed, records are read
+ * one at a time and dispatched on their type until an END record has been
+ * handled.  Unknown record types are skipped if they carry the
+ * REC_TYPE_optional bit and are an error otherwise.  Once the records are
+ * consumed, pagetables are pinned, the guest p2m is updated and the grant
+ * table is seeded.
+ *
+ * On every path past the initial checks, ctx->x86_pv.p2m_pfns is freed and
+ * the m2p mapping (if present) is unmapped before returning.
+ *
+ * Returns 0 on success, nonzero on failure.
+ */
+int restore_x86_pv(struct context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    struct record rec;
+    int rc;
+
+    IPRINTF("In experimental %s", __func__);
+
+    if ( ctx->restore.guest_type != DHDR_TYPE_x86_pv )
+    {
+        ERROR("Unable to restore %s domain into an x86_pv domain",
+              dhdr_type_to_str(ctx->restore.guest_type));
+        return -1;
+    }
+
+    if ( ctx->restore.guest_page_size != 4096 )
+    {
+        ERROR("Invalid page size %d for x86_pv domains",
+              ctx->restore.guest_page_size);
+        return -1;
+    }
+
+    /* Setup: the m2p is only mapped if the domain info step succeeded. */
+    rc = x86_pv_domain_info(ctx);
+    if ( !rc )
+        rc = x86_pv_map_m2p(ctx);
+    if ( rc )
+        goto err;
+
+    /* Main loop: one iteration per record, ending after the END record. */
+    for ( ; ; )
+    {
+        rc = read_record(ctx, &rec);
+        if ( rc )
+            goto err;
+
+        switch ( rec.type )
+        {
+        case REC_TYPE_end:
+            rc = handle_end(ctx, &rec);
+            break;
+
+        case REC_TYPE_page_data:
+            rc = handle_page_data(ctx, &rec);
+            break;
+
+        case REC_TYPE_x86_pv_info:
+            rc = handle_x86_pv_info(ctx, &rec);
+            break;
+
+        case REC_TYPE_x86_pv_p2m_frames:
+            rc = handle_x86_pv_p2m_frames(ctx, &rec);
+            break;
+
+        case REC_TYPE_x86_pv_vcpu_basic:
+            rc = handle_x86_pv_vcpu_basic(ctx, &rec);
+            break;
+
+        case REC_TYPE_x86_pv_vcpu_extended:
+            rc = handle_x86_pv_vcpu_extended(ctx, &rec);
+            break;
+
+        case REC_TYPE_x86_pv_vcpu_xsave:
+            rc = handle_x86_pv_vcpu_xsave(ctx, &rec);
+            break;
+
+        case REC_TYPE_x86_pv_shared_info:
+            rc = handle_x86_pv_shared_info(ctx, &rec);
+            break;
+
+        case REC_TYPE_tsc_info:
+            rc = handle_tsc_info(ctx, &rec);
+            break;
+
+        default:
+            /* Unrecognised types are tolerated only when marked optional. */
+            if ( !(rec.type & REC_TYPE_optional) )
+            {
+                ERROR("Invalid record type (0x%"PRIx32", %s) for x86_pv domains",
+                      rec.type, rec_type_to_str(rec.type));
+                rc = -1;
+            }
+            else
+            {
+                IPRINTF("Ignoring optional record (0x%"PRIx32", %s)",
+                        rec.type, rec_type_to_str(rec.type));
+                rc = 0;
+            }
+            break;
+        }
+
+        /* The record payload is released whether or not its handler failed. */
+        free(rec.data);
+        if ( rc )
+            goto err;
+
+        if ( rec.type == REC_TYPE_end )
+            break;
+    }
+
+    IPRINTF("Finished reading records");
+
+    /* Finalisation: the p2m update only runs if pinning succeeded. */
+    rc = pin_pagetables(ctx);
+    if ( !rc )
+        rc = update_guest_p2m(ctx);
+    if ( rc )
+        goto err;
+
+    rc = xc_dom_gnttab_seed(xch, ctx->domid,
+                            ctx->restore.console_mfn,
+                            ctx->restore.xenstore_mfn,
+                            ctx->restore.console_domid,
+                            ctx->restore.xenstore_domid);
+    if ( rc )
+    {
+        PERROR("Failed to seed grant table");
+        goto err;
+    }
+
+    /* Success: skip the error-path sanity check. */
+    IPRINTF("All Done");
+    assert(!rc);
+    goto cleanup;
+
+ err:
+    assert(rc);
+ cleanup:
+
+    free(ctx->x86_pv.p2m_pfns);
+
+    if ( ctx->x86_pv.m2p )
+        munmap(ctx->x86_pv.m2p, ctx->x86_pv.nr_m2p_frames * PAGE_SIZE);
+
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
-- 
1.7.10.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.