[Xen-devel] [PATCH v9 12/15] tools/libxc: noarch save code
Save a domain, calling domain type specific functions at the appropriate
points.  This implements the xc_domain_save2() API function, which is
equivalent to the existing xc_domain_save().

This writes the image and domain headers, and writes all the PAGE_DATA
records using a "live" process.

Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
CC: Ian Campbell <Ian.Campbell@xxxxxxxxxx>
CC: Ian Jackson <Ian.Jackson@xxxxxxxxxxxxx>
CC: Wei Liu <wei.liu2@xxxxxxxxxx>
---
 tools/libxc/xc_sr_save.c | 777 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 776 insertions(+), 1 deletion(-)

diff --git a/tools/libxc/xc_sr_save.c b/tools/libxc/xc_sr_save.c
index 3c34a6b..c52fa33 100644
--- a/tools/libxc/xc_sr_save.c
+++ b/tools/libxc/xc_sr_save.c
@@ -1,11 +1,786 @@
+#include <assert.h>
+#include <arpa/inet.h>
+
 #include "xc_sr_common.h"
 
+/*
+ * Writes an Image header and Domain header into the stream.
+ */
+static int write_headers(struct xc_sr_context *ctx, uint16_t guest_type)
+{
+    xc_interface *xch = ctx->xch;
+    int32_t xen_version = xc_version(xch, XENVER_version, NULL);
+    struct xc_sr_ihdr ihdr =
+        {
+            .marker  = IHDR_MARKER,
+            .id      = htonl(IHDR_ID),
+            .version = htonl(IHDR_VERSION),
+            .options = htons(IHDR_OPT_LITTLE_ENDIAN),
+        };
+    struct xc_sr_dhdr dhdr =
+        {
+            .type       = guest_type,
+            .page_shift = XC_PAGE_SHIFT,
+            .xen_major  = (xen_version >> 16) & 0xffff,
+            .xen_minor  = (xen_version)       & 0xffff,
+        };
+
+    if ( xen_version < 0 )
+    {
+        PERROR("Unable to obtain Xen Version");
+        return -1;
+    }
+
+    if ( write_exact(ctx->fd, &ihdr, sizeof(ihdr)) )
+    {
+        PERROR("Unable to write Image Header to stream");
+        return -1;
+    }
+
+    if ( write_exact(ctx->fd, &dhdr, sizeof(dhdr)) )
+    {
+        PERROR("Unable to write Domain Header to stream");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Writes an END record into the stream.
+ */
+static int write_end_record(struct xc_sr_context *ctx)
+{
+    struct xc_sr_record end = { REC_TYPE_END, 0, NULL };
+
+    return write_record(ctx, &end);
+}
+
+/*
+ * Writes a batch of memory as a PAGE_DATA record into the stream.  The batch
+ * is constructed in ctx->save.batch_pfns.
+ *
+ * This function:
+ * - gets the types for each pfn in the batch.
+ * - for each pfn with real data:
+ *   - maps and attempts to localise the pages.
+ * - constructs and writes a PAGE_DATA record into the stream.
+ */
+static int write_batch(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t *mfns = NULL, *types = NULL;
+    void *guest_mapping = NULL;
+    void **guest_data = NULL;
+    void **local_pages = NULL;
+    int *errors = NULL, rc = -1;
+    unsigned i, p, nr_pages = 0;
+    unsigned nr_pfns = ctx->save.nr_batch_pfns;
+    void *page, *orig_page;
+    uint64_t *rec_pfns = NULL;
+    struct iovec *iov = NULL; int iovcnt = 0;
+    struct xc_sr_rec_page_data_header hdr = { 0 };
+    struct xc_sr_record rec =
+    {
+        .type = REC_TYPE_PAGE_DATA,
+    };
+
+    assert(nr_pfns != 0);
+
+    /* Mfns of the batch pfns. */
+    mfns = malloc(nr_pfns * sizeof(*mfns));
+    /* Types of the batch pfns. */
+    types = malloc(nr_pfns * sizeof(*types));
+    /* Errors from attempting to map the gfns. */
+    errors = malloc(nr_pfns * sizeof(*errors));
+    /* Pointers to page data to send.  Mapped gfns or local allocations. */
+    guest_data = calloc(nr_pfns, sizeof(*guest_data));
+    /* Pointers to locally allocated pages.  Need freeing. */
+    local_pages = calloc(nr_pfns, sizeof(*local_pages));
+    /* iovec[] for writev(). */
+    iov = malloc((nr_pfns + 4) * sizeof(*iov));
+
+    if ( !mfns || !types || !errors || !guest_data || !local_pages || !iov )
+    {
+        ERROR("Unable to allocate arrays for a batch of %u pages",
+              nr_pfns);
+        goto err;
+    }
+
+    for ( i = 0; i < nr_pfns; ++i )
+    {
+        types[i] = mfns[i] = ctx->save.ops.pfn_to_gfn(ctx,
+                                                      ctx->save.batch_pfns[i]);
+
+        /* Likely a ballooned page. */
+        if ( mfns[i] == INVALID_MFN )
+        {
+            set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages);
+            ++ctx->save.nr_deferred_pages;
+        }
+    }
+
+    rc = xc_get_pfn_type_batch(xch, ctx->domid, nr_pfns, types);
+    if ( rc )
+    {
+        PERROR("Failed to get types for pfn batch");
+        goto err;
+    }
+    rc = -1;
+
+    for ( i = 0; i < nr_pfns; ++i )
+    {
+        switch ( types[i] )
+        {
+        case XEN_DOMCTL_PFINFO_BROKEN:
+        case XEN_DOMCTL_PFINFO_XALLOC:
+        case XEN_DOMCTL_PFINFO_XTAB:
+            continue;
+        }
+
+        mfns[nr_pages++] = mfns[i];
+    }
+
+    if ( nr_pages > 0 )
+    {
+        guest_mapping = xc_map_foreign_bulk(
+            xch, ctx->domid, PROT_READ, mfns, errors, nr_pages);
+        if ( !guest_mapping )
+        {
+            PERROR("Failed to map guest pages");
+            goto err;
+        }
+
+        for ( i = 0, p = 0; i < nr_pfns; ++i )
+        {
+            switch ( types[i] )
+            {
+            case XEN_DOMCTL_PFINFO_BROKEN:
+            case XEN_DOMCTL_PFINFO_XALLOC:
+            case XEN_DOMCTL_PFINFO_XTAB:
+                continue;
+            }
+
+            if ( errors[p] )
+            {
+                ERROR("Mapping of pfn %#lx (mfn %#lx) failed %d",
+                      ctx->save.batch_pfns[i], mfns[p], errors[p]);
+                goto err;
+            }
+
+            orig_page = page = guest_mapping + (p * PAGE_SIZE);
+            rc = ctx->save.ops.normalise_page(ctx, types[i], &page);
+
+            if ( orig_page != page )
+                local_pages[i] = page;
+
+            if ( rc )
+            {
+                if ( rc == -1 && errno == EAGAIN )
+                {
+                    set_bit(ctx->save.batch_pfns[i], ctx->save.deferred_pages);
+                    ++ctx->save.nr_deferred_pages;
+                    types[i] = XEN_DOMCTL_PFINFO_XTAB;
+                    --nr_pages;
+                }
+                else
+                    goto err;
+            }
+            else
+                guest_data[i] = page;
+
+            rc = -1;
+            ++p;
+        }
+    }
+
+    rec_pfns = malloc(nr_pfns * sizeof(*rec_pfns));
+    if ( !rec_pfns )
+    {
+        ERROR("Unable to allocate %zu bytes of memory for page data pfn list",
+              nr_pfns * sizeof(*rec_pfns));
+        goto err;
+    }
+
+    hdr.count = nr_pfns;
+
+    rec.length = sizeof(hdr);
+    rec.length += nr_pfns * sizeof(*rec_pfns);
+    rec.length += nr_pages * PAGE_SIZE;
+
+    for ( i = 0; i < nr_pfns; ++i )
+        rec_pfns[i] = ((uint64_t)(types[i]) << 32) | ctx->save.batch_pfns[i];
+
+    iov[0].iov_base = &rec.type;
+    iov[0].iov_len = sizeof(rec.type);
+
+    iov[1].iov_base = &rec.length;
+    iov[1].iov_len = sizeof(rec.length);
+
+    iov[2].iov_base = &hdr;
+    iov[2].iov_len = sizeof(hdr);
+
+    iov[3].iov_base = rec_pfns;
+    iov[3].iov_len = nr_pfns * sizeof(*rec_pfns);
+
+    iovcnt = 4;
+
+    if ( nr_pages )
+    {
+        for ( i = 0; i < nr_pfns; ++i )
+        {
+            if ( guest_data[i] )
+            {
+                iov[iovcnt].iov_base = guest_data[i];
+                iov[iovcnt].iov_len = PAGE_SIZE;
+                iovcnt++;
+                --nr_pages;
+            }
+        }
+    }
+
+    if ( writev_exact(ctx->fd, iov, iovcnt) )
+    {
+        PERROR("Failed to write page data to stream");
+        goto err;
+    }
+
+    /* Sanity check we have sent all the pages we expected to. */
+    assert(nr_pages == 0);
+    rc = ctx->save.nr_batch_pfns = 0;
+
+ err:
+    free(rec_pfns);
+    if ( guest_mapping )
+        munmap(guest_mapping, nr_pages * PAGE_SIZE);
+    for ( i = 0; local_pages && i < nr_pfns; ++i )
+        free(local_pages[i]);
+    free(iov);
+    free(local_pages);
+    free(guest_data);
+    free(errors);
+    free(types);
+    free(mfns);
+
+    return rc;
+}
+
+/*
+ * Flush a batch of pfns into the stream.
+ */
+static int flush_batch(struct xc_sr_context *ctx)
+{
+    int rc = 0;
+
+    if ( ctx->save.nr_batch_pfns == 0 )
+        return rc;
+
+    rc = write_batch(ctx);
+
+    if ( !rc )
+    {
+        VALGRIND_MAKE_MEM_UNDEFINED(ctx->save.batch_pfns,
+                                    MAX_BATCH_SIZE *
+                                    sizeof(*ctx->save.batch_pfns));
+    }
+
+    return rc;
+}
+
+/*
+ * Add a single pfn to the batch, flushing the batch if full.
+ */
+static int add_to_batch(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    int rc = 0;
+
+    if ( ctx->save.nr_batch_pfns == MAX_BATCH_SIZE )
+        rc = flush_batch(ctx);
+
+    if ( rc == 0 )
+        ctx->save.batch_pfns[ctx->save.nr_batch_pfns++] = pfn;
+
+    return rc;
+}
+
+/*
+ * Pause/suspend the domain, and refresh ctx->dominfo if required.
+ */
+static int suspend_domain(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+
+    /* TODO: Properly specify the return value from this callback.  All
+     * implementations currently appear to return 1 for success, whereas
+     * the legacy code checks for != 0. */
+    int cb_rc = ctx->save.callbacks->suspend(ctx->save.callbacks->data);
+
+    if ( cb_rc == 0 )
+    {
+        ERROR("save callback suspend() failed: %d", cb_rc);
+        return -1;
+    }
+
+    /* Refresh domain information. */
+    if ( (xc_domain_getinfo(xch, ctx->domid, 1, &ctx->dominfo) != 1) ||
+         (ctx->dominfo.domid != ctx->domid) )
+    {
+        PERROR("Unable to refresh domain information");
+        return -1;
+    }
+
+    /* Confirm the domain has actually been paused. */
+    if ( !ctx->dominfo.shutdown ||
+         (ctx->dominfo.shutdown_reason != SHUTDOWN_suspend) )
+    {
+        ERROR("Domain has not been suspended: shutdown %d, reason %d",
+              ctx->dominfo.shutdown, ctx->dominfo.shutdown_reason);
+        return -1;
+    }
+
+    xc_report_progress_single(xch, "Domain now suspended");
+
+    return 0;
+}
+
+/*
+ * Send all pages in the guest's p2m.  Used as the first iteration of the
+ * live migration loop, and for a non-live save.
+ */
+static int send_all_pages(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t p;
+    int rc;
+
+    for ( p = 0; p < ctx->save.p2m_size; ++p )
+    {
+        rc = add_to_batch(ctx, p);
+        if ( rc )
+            return rc;
+
+        /* Update progress every 4MB worth of memory sent. */
+        if ( (p & ((1U << (22 - 12)) - 1)) == 0 )
+            xc_report_progress_step(xch, p, ctx->save.p2m_size);
+    }
+
+    rc = flush_batch(ctx);
+    if ( rc )
+        return rc;
+
+    xc_report_progress_step(xch, ctx->save.p2m_size,
+                            ctx->save.p2m_size);
+    return 0;
+}
+
+/*
+ * Send a subset of pages in the guest's p2m, according to the provided
+ * bitmap.  Used for each subsequent iteration of the live migration loop.
+ *
+ * Bitmap is bounded by p2m_size.
+ */
+static int send_some_pages(struct xc_sr_context *ctx,
+                           unsigned long *bitmap,
+                           unsigned long entries)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t p;
+    unsigned long written;
+    int rc;
+
+    for ( p = 0, written = 0; p < ctx->save.p2m_size; ++p )
+    {
+        if ( !test_bit(p, bitmap) )
+            continue;
+
+        rc = add_to_batch(ctx, p);
+        if ( rc )
+            return rc;
+
+        /* Update progress every 4MB worth of memory sent. */
+        if ( (written & ((1U << (22 - 12)) - 1)) == 0 )
+            xc_report_progress_step(xch, written, entries);
+
+        ++written;
+    }
+
+    rc = flush_batch(ctx);
+    if ( rc )
+        return rc;
+
+    if ( written > entries )
+        DPRINTF("Bitmap contained more entries than expected...");
+
+    xc_report_progress_step(xch, entries, entries);
+    return 0;
+}
+
+static int enable_logdirty(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    int on1 = 0, off = 0, on2 = 0;
+    int rc;
+
+    /* This juggling is required if logdirty is enabled for VRAM tracking. */
+    rc = xc_shadow_control(xch, ctx->domid,
+                           XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                           NULL, 0, NULL, 0, NULL);
+    if ( rc < 0 )
+    {
+        on1 = errno;
+        rc = xc_shadow_control(xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_OFF,
+                               NULL, 0, NULL, 0, NULL);
+        if ( rc < 0 )
+            off = errno;
+        else {
+            rc = xc_shadow_control(xch, ctx->domid,
+                                   XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                                   NULL, 0, NULL, 0, NULL);
+            if ( rc < 0 )
+                on2 = errno;
+        }
+        if ( rc < 0 )
+        {
+            PERROR("Failed to enable logdirty: %d,%d,%d", on1, off, on2);
+            return rc;
+        }
+    }
+
+    return 0;
+}
+
+static int update_progress_string(struct xc_sr_context *ctx,
+                                  char **str, unsigned iter)
+{
+    xc_interface *xch = ctx->xch;
+    char *new_str = NULL;
+
+    if ( asprintf(&new_str, "Memory iteration %u of %u",
+                  iter, ctx->save.max_iterations) == -1 )
+    {
+        PERROR("Unable to allocate new progress string");
+        return -1;
+    }
+
+    free(*str);
+    *str = new_str;
+
+    xc_set_progress_prefix(xch, *str);
+    return 0;
+}
+
+/*
+ * Send all domain memory.  This is the heart of the live migration loop.
+ */
+static int send_domain_memory_live(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    DECLARE_HYPERCALL_BUFFER(unsigned long, to_send);
+    xc_shadow_op_stats_t stats = { 0, ctx->save.p2m_size };
+    char *progress_str = NULL;
+    unsigned x;
+    int rc = -1;
+
+    to_send = xc_hypercall_buffer_alloc_pages(
+        xch, to_send, NRPAGES(bitmap_size(ctx->save.p2m_size)));
+
+    ctx->save.batch_pfns = malloc(MAX_BATCH_SIZE *
+                                  sizeof(*ctx->save.batch_pfns));
+    ctx->save.deferred_pages = calloc(1, bitmap_size(ctx->save.p2m_size));
+
+    if ( !ctx->save.batch_pfns || !to_send || !ctx->save.deferred_pages )
+    {
+        ERROR("Unable to allocate memory for to_{send,fix}/batch bitmaps");
+        goto out;
+    }
+
+    rc = enable_logdirty(ctx);
+    if ( rc )
+        goto out;
+
+    rc = update_progress_string(ctx, &progress_str, 0);
+    if ( rc )
+        goto out;
+
+    rc = send_all_pages(ctx);
+    if ( rc )
+        goto out;
+
+    for ( x = 1;
+          ((x < ctx->save.max_iterations) &&
+           (stats.dirty_count > ctx->save.dirty_threshold)); ++x )
+    {
+        if ( xc_shadow_control(
+                 xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+                 HYPERCALL_BUFFER(to_send), ctx->save.p2m_size,
+                 NULL, 0, &stats) != ctx->save.p2m_size )
+        {
+            PERROR("Failed to retrieve logdirty bitmap");
+            rc = -1;
+            goto out;
+        }
+
+        if ( stats.dirty_count == 0 )
+            break;
+
+        rc = update_progress_string(ctx, &progress_str, x);
+        if ( rc )
+            goto out;
+
+        rc = send_some_pages(ctx, to_send, stats.dirty_count);
+        if ( rc )
+            goto out;
+    }
+
+    rc = suspend_domain(ctx);
+    if ( rc )
+        goto out;
+
+    if ( xc_shadow_control(
+             xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+             HYPERCALL_BUFFER(to_send), ctx->save.p2m_size,
+             NULL, 0, &stats) != ctx->save.p2m_size )
+    {
+        PERROR("Failed to retrieve logdirty bitmap");
+        rc = -1;
+        goto out;
+    }
+
+    rc = update_progress_string(ctx, &progress_str, ctx->save.max_iterations);
+    if ( rc )
+        goto out;
+
+    bitmap_or(to_send, ctx->save.deferred_pages, ctx->save.p2m_size);
+
+    rc = send_some_pages(ctx, to_send,
+                         stats.dirty_count + ctx->save.nr_deferred_pages);
+    if ( rc )
+        goto out;
+
+    if ( ctx->save.debug )
+    {
+        struct xc_sr_record rec =
+        {
+            .type = REC_TYPE_VERIFY,
+            .length = 0,
+        };
+
+        DPRINTF("Enabling verify mode");
+
+        rc = write_record(ctx, &rec);
+        if ( rc )
+            goto out;
+
+        xc_set_progress_prefix(xch, "Memory verify");
+        rc = send_all_pages(ctx);
+        if ( rc )
+            goto out;
+
+        if ( xc_shadow_control(
+                 xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_PEEK,
+                 HYPERCALL_BUFFER(to_send), ctx->save.p2m_size,
+                 NULL, 0, &stats) != ctx->save.p2m_size )
+        {
+            PERROR("Failed to retrieve logdirty bitmap");
+            rc = -1;
+            goto out;
+        }
+
+        DPRINTF(" Further stats: faults %u, dirty %u",
+                stats.fault_count, stats.dirty_count);
+    }
+
+ out:
+    xc_set_progress_prefix(xch, NULL);
+    free(progress_str);
+    xc_hypercall_buffer_free_pages(xch, to_send,
+                                   NRPAGES(bitmap_size(ctx->save.p2m_size)));
+    free(ctx->save.deferred_pages);
+    free(ctx->save.batch_pfns);
+    return rc;
+}
+
+/*
+ * Send all domain memory, pausing the domain first.  Generally used for
+ * suspend-to-file.
+ */
+static int send_domain_memory_nonlive(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    int rc = -1;
+
+    ctx->save.batch_pfns = malloc(MAX_BATCH_SIZE *
+                                  sizeof(*ctx->save.batch_pfns));
+    ctx->save.deferred_pages = calloc(1, bitmap_size(ctx->save.p2m_size));
+
+    if ( !ctx->save.batch_pfns || !ctx->save.deferred_pages )
+    {
+        PERROR("Failed to allocate memory for nonlive tracking structures");
+        errno = ENOMEM;
+        goto err;
+    }
+
+    rc = suspend_domain(ctx);
+    if ( rc )
+        goto err;
+
+    xc_set_progress_prefix(xch, "Memory");
+
+    rc = send_all_pages(ctx);
+    if ( rc )
+        goto err;
+
+ err:
+    free(ctx->save.deferred_pages);
+    free(ctx->save.batch_pfns);
+
+    return rc;
+}
+
+/*
+ * Save a domain.
+ */
+static int save(struct xc_sr_context *ctx, uint16_t guest_type)
+{
+    xc_interface *xch = ctx->xch;
+    int rc, saved_rc = 0, saved_errno = 0;
+
+    IPRINTF("Saving domain %d, type %s",
+            ctx->domid, dhdr_type_to_str(guest_type));
+
+    rc = ctx->save.ops.setup(ctx);
+    if ( rc )
+        goto err;
+
+    xc_report_progress_single(xch, "Start of stream");
+
+    rc = write_headers(ctx, guest_type);
+    if ( rc )
+        goto err;
+
+    rc = ctx->save.ops.start_of_stream(ctx);
+    if ( rc )
+        goto err;
+
+    if ( ctx->save.live )
+        rc = send_domain_memory_live(ctx);
+    else
+        rc = send_domain_memory_nonlive(ctx);
+
+    if ( rc )
+        goto err;
+
+    if ( !ctx->dominfo.shutdown ||
+         (ctx->dominfo.shutdown_reason != SHUTDOWN_suspend) )
+    {
+        ERROR("Domain has not been suspended");
+        rc = -1;
+        goto err;
+    }
+
+    xc_report_progress_single(xch, "End of stream");
+
+    rc = ctx->save.ops.end_of_stream(ctx);
+    if ( rc )
+        goto err;
+
+    rc = write_end_record(ctx);
+    if ( rc )
+        goto err;
+
+    xc_report_progress_single(xch, "Complete");
+    goto done;
+
+ err:
+    saved_errno = errno;
+    saved_rc = rc;
+    PERROR("Save failed");
+
+ done:
+    xc_shadow_control(xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_OFF,
+                      NULL, 0, NULL, 0, NULL);
+
+    rc = ctx->save.ops.cleanup(ctx);
+    if ( rc )
+        PERROR("Failed to clean up");
+
+    if ( saved_rc )
+    {
+        rc = saved_rc;
+        errno = saved_errno;
+    }
+
+    return rc;
+};
+
 int xc_domain_save2(xc_interface *xch, int io_fd, uint32_t dom,
                     uint32_t max_iters, uint32_t max_factor, uint32_t flags,
                     struct save_callbacks* callbacks, int hvm)
 {
+    xen_pfn_t nr_pfns;
+    struct xc_sr_context ctx =
+        {
+            .xch = xch,
+            .fd = io_fd,
+        };
+
+    /* GCC 4.4 (of CentOS 6.x vintage) can't initialise anonymous unions :( */
+    ctx.save.callbacks = callbacks;
+    ctx.save.live  = !!(flags & XCFLAGS_LIVE);
+    ctx.save.debug = !!(flags & XCFLAGS_DEBUG);
+
+    /*
+     * TODO: Find some time to better tweak the live migration algorithm.
+     *
+     * These parameters are better than the legacy algorithm, especially for
+     * busy guests.
+     */
+    ctx.save.max_iterations = 5;
+    ctx.save.dirty_threshold = 50;
+
     IPRINTF("In experimental %s", __func__);
-    return -1;
+    DPRINTF("fd %d, dom %u, max_iters %u, max_factor %u, flags %u, hvm %d",
+            io_fd, dom, max_iters, max_factor, flags, hvm);
+
+    if ( xc_domain_getinfo(xch, dom, 1, &ctx.dominfo) != 1 )
+    {
+        PERROR("Failed to get domain info");
+        return -1;
+    }
+
+    if ( ctx.dominfo.domid != dom )
+    {
+        ERROR("Domain %u does not exist", dom);
+        return -1;
+    }
+
+    ctx.domid = dom;
+
+    if ( xc_domain_nr_gpfns(xch, dom, &nr_pfns) < 0 )
+    {
+        PERROR("Unable to obtain the guest p2m size");
+        return -1;
+    }
+
+    ctx.save.p2m_size = nr_pfns;
+
+    if ( ctx.save.p2m_size > ~XEN_DOMCTL_PFINFO_LTAB_MASK )
+    {
+        errno = E2BIG;
+        ERROR("Cannot save this big a guest");
+        return -1;
+    }
+
+    if ( ctx.dominfo.hvm )
+    {
+        ctx.save.ops = save_ops_x86_hvm;
+        return save(&ctx, DHDR_TYPE_X86_HVM);
+    }
+    else
+    {
+        ctx.save.ops = save_ops_x86_pv;
+        return save(&ctx, DHDR_TYPE_X86_PV);
+    }
 }
 
 /*
-- 
1.7.10.4
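For context (not part of the patch): a minimal sketch of how a toolstack
caller might drive the new entry point.  The xc_domain_save2() signature,
the XCFLAGS_* flags, and the suspend-callback convention (return 1 for
success, per the TODO in suspend_domain()) come from this series and
xenguest.h; the domid, file name, and my_suspend() helper are illustrative
assumptions, and a real HVM caller would also supply e.g. a
switch_qemu_logdirty callback.

#include <fcntl.h>
#include <stdlib.h>

#include <xenctrl.h>
#include <xenguest.h>

/* All current suspend-callback implementations return 1 for success. */
static int my_suspend(void *data)
{
    /* Toolstack-specific logic to suspend the domain would go here. */
    return 1;
}

int main(void)
{
    uint32_t domid = 1;                              /* assumed domid */
    xc_interface *xch = xc_interface_open(NULL, NULL, 0);
    int fd = open("domain.img", O_WRONLY | O_CREAT | O_TRUNC, 0644);
    struct save_callbacks cbs = { .suspend = my_suspend, .data = NULL };
    int rc;

    if ( !xch || fd < 0 )
        return EXIT_FAILURE;

    /* Live save of an HVM guest.  max_iters/max_factor are accepted for
     * API compatibility with xc_domain_save(); this implementation
     * currently uses its own defaults (5 iterations, 50 dirty pages). */
    rc = xc_domain_save2(xch, fd, domid, 0, 0, XCFLAGS_LIVE, &cbs, 1);

    xc_interface_close(xch);
    return rc ? EXIT_FAILURE : EXIT_SUCCESS;
}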