Re: [Xen-devel] superpages lost after migration of HVM domU
On Thu, Apr 20, Jan Beulich wrote:
> >>> On 20.04.17 at 18:04, <olaf@xxxxxxxxx> wrote:
> > On Thu, Apr 20, Andrew Cooper wrote:
> >
> >> As it currently stands, the sending side iterates from 0 to p2m_size,
> >> and sends every frame on the first pass. This means we get PAGE_DATA
> >> records linearly, in batches of 1024, or two aligned 2M superpages.
> > Is there a way to preserve 1G pages? This 380G domU I'm looking at is
> > built with 4k:461390 2M:2341 1G:365 pages.
> I think we've hashed out a possible way to deal with this, by
> speculatively allocating 1G pages as long as the allocation cap for
> the domain allows, subsequently punching holes into those pages
> if we can't allocate any new pages anymore (due to otherwise
> overrunning the cap).

The result is not pretty. This HVM-only approach appears to work for a
domU with "memory=3024" and localhost migration.

Holes have to be punched as soon as possible to avoid "Over-allocation"
errors in xenforeignmemory_map. It would be nice if the receiver got a
memory map upfront to avoid all these stunts... A stripped-down sketch
of the resulting allocate-then-free flow is appended after the patch.

Olaf

diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h
index a83f22af4e..36e7891dde 100644
--- a/tools/libxc/xc_sr_common.h
+++ b/tools/libxc/xc_sr_common.h
@@ -107,6 +107,9 @@ struct xc_sr_save_ops
  */
 struct xc_sr_restore_ops
 {
+    /* Allocate a MFN for the given PFN */
+    int (*allocate_pfn)(struct xc_sr_context *ctx, xen_pfn_t pfn);
+
     /* Convert a PFN to GFN. May return ~0UL for an invalid mapping. */
     xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
 
@@ -331,6 +334,14 @@ struct xc_sr_context
                 /* HVM context blob. */
                 void *context;
                 size_t contextsz;
+
+                /* Bitmap of currently allocated PFNs during restore. */
+                xen_pfn_t *sp_extents;
+                unsigned long *attempted_1g;
+                unsigned long *attempted_2m;
+                unsigned long *allocated_pfns;
+                xen_pfn_t max_allocated_pfn;
+                unsigned long alloc_cnt;
             } restore;
         };
     } x86_hvm;
diff --git a/tools/libxc/xc_sr_restore.c b/tools/libxc/xc_sr_restore.c
index 3549f0a1ae..2e8d15307f 100644
--- a/tools/libxc/xc_sr_restore.c
+++ b/tools/libxc/xc_sr_restore.c
@@ -135,6 +135,7 @@ int populate_pfns(struct xc_sr_context *ctx, unsigned count,
                   const xen_pfn_t *original_pfns, const uint32_t *types)
 {
     xc_interface *xch = ctx->xch;
+    xen_pfn_t min_pfn = original_pfns[0], max_pfn = original_pfns[0];
    xen_pfn_t *mfns = malloc(count * sizeof(*mfns)),
        *pfns = malloc(count * sizeof(*pfns));
    unsigned i, nr_pfns = 0;
@@ -149,11 +150,18 @@ int populate_pfns(struct xc_sr_context *ctx, unsigned count,
 
     for ( i = 0; i < count; ++i )
     {
+        if (original_pfns[i] < min_pfn)
+            min_pfn = original_pfns[i];
+        if (original_pfns[i] > max_pfn)
+            max_pfn = original_pfns[i];
         if ( (!types || (types &&
                          (types[i] != XEN_DOMCTL_PFINFO_XTAB &&
                           types[i] != XEN_DOMCTL_PFINFO_BROKEN))) &&
              !pfn_is_populated(ctx, original_pfns[i]) )
         {
+            rc = ctx->restore.ops.allocate_pfn(ctx, original_pfns[i]);
+            if ( rc )
+                goto err;
             rc = pfn_set_populated(ctx, original_pfns[i]);
             if ( rc )
                 goto err;
@@ -161,6 +169,16 @@ int populate_pfns(struct xc_sr_context *ctx, unsigned count,
             ++nr_pfns;
         }
     }
+    IPRINTF("checking range %lx %lx\n", min_pfn, max_pfn);
+    while (min_pfn < max_pfn) {
+        if (!pfn_is_populated(ctx, min_pfn) && test_and_clear_bit(min_pfn, ctx->x86_hvm.restore.allocated_pfns)) {
+            xen_pfn_t pfn = min_pfn;
+            rc = xc_domain_decrease_reservation_exact(xch, ctx->domid, 1, 0, &pfn);
+            IPRINTF("free %lx %lx %d\n", min_pfn, pfn, rc);
+        }
+        min_pfn++;
+    }
+    nr_pfns = 0;
 
     if ( nr_pfns )
     {
@@ -723,6 +741,10 @@ static void cleanup(struct xc_sr_context *ctx)
                                    NRPAGES(bitmap_size(ctx->restore.p2m_size)));
     free(ctx->restore.buffered_records);
     free(ctx->restore.populated_pfns);
+    free(ctx->x86_hvm.restore.sp_extents);
+    free(ctx->x86_hvm.restore.attempted_1g);
+    free(ctx->x86_hvm.restore.attempted_2m);
+    free(ctx->x86_hvm.restore.allocated_pfns);
 
     if ( ctx->restore.ops.cleanup(ctx) )
         PERROR("Failed to clean up");
 }
@@ -810,6 +832,17 @@ static int restore(struct xc_sr_context *ctx)
     saved_errno = errno;
     saved_rc = rc;
     PERROR("Restore failed");
+    {
+        unsigned long i;
+        bool a, p;
+        IPRINTF("alloc_cnt %lu\n", ctx->x86_hvm.restore.alloc_cnt);
+        for (i = 0; i < ctx->restore.p2m_size; i++) {
+            p = test_bit(i, ctx->restore.populated_pfns);
+            a = test_bit(i, ctx->x86_hvm.restore.allocated_pfns);
+            if (p != a)
+                IPRINTF("%lx a %x p %x\n", i, a, p);
+        }
+    }
 
  done:
     cleanup(ctx);
@@ -888,6 +921,7 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
     }
 
     ctx.restore.p2m_size = nr_pfns;
+    IPRINTF("p2m_size %lx\n", ctx.restore.p2m_size);
 
     if ( ctx.dominfo.hvm )
     {
diff --git a/tools/libxc/xc_sr_restore_x86_hvm.c b/tools/libxc/xc_sr_restore_x86_hvm.c
index 1dca85354a..fc441d2a6d 100644
--- a/tools/libxc/xc_sr_restore_x86_hvm.c
+++ b/tools/libxc/xc_sr_restore_x86_hvm.c
@@ -3,6 +3,10 @@
 
 #include "xc_sr_common_x86.h"
 
+#define SUPERPAGE_2MB_SHIFT   9
+#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT)
+#define SUPERPAGE_1GB_SHIFT   18
+#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT)
 /*
  * Process an HVM_CONTEXT record from the stream.
  */
@@ -149,6 +153,20 @@ static int x86_hvm_setup(struct xc_sr_context *ctx)
         return -1;
     }
 
+    ctx->x86_hvm.restore.sp_extents = calloc(1UL << SUPERPAGE_1GB_SHIFT, sizeof(*ctx->x86_hvm.restore.sp_extents));
+    ctx->x86_hvm.restore.attempted_1g = bitmap_alloc((ctx->restore.p2m_size >> SUPERPAGE_1GB_SHIFT) + 1);
+    ctx->x86_hvm.restore.attempted_2m = bitmap_alloc((ctx->restore.p2m_size >> SUPERPAGE_2MB_SHIFT) + 1);
+    ctx->x86_hvm.restore.max_allocated_pfn = ctx->restore.p2m_size;
+    ctx->x86_hvm.restore.allocated_pfns = bitmap_alloc(ctx->x86_hvm.restore.max_allocated_pfn + 1);
+    if (!ctx->x86_hvm.restore.sp_extents || !ctx->x86_hvm.restore.allocated_pfns || !ctx->x86_hvm.restore.attempted_2m || !ctx->x86_hvm.restore.attempted_1g)
+    {
+        ERROR("Unable to allocate memory for allocated_pfns bitmaps");
+        return -1;
+    }
+    /* No superpage in 1st 2MB due to VGA hole */
+    set_bit(0, ctx->x86_hvm.restore.attempted_1g);
+    set_bit(0, ctx->x86_hvm.restore.attempted_2m);
+
     return 0;
 }
@@ -228,8 +246,139 @@ static int x86_hvm_cleanup(struct xc_sr_context *ctx)
     return 0;
 }
 
+static bool pfn_is_allocated(const struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    if ( pfn > ctx->x86_hvm.restore.max_allocated_pfn )
+        return false;
+    return test_bit(pfn, ctx->x86_hvm.restore.allocated_pfns);
+}
+
+/*
+ * Set a pfn as allocated, expanding the tracking structures if needed. To
+ * avoid realloc()ing too excessively, the size increased to the nearest power
+ * of two large enough to contain the required pfn.
+ */
+static int pfn_set_allocated(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    xc_interface *xch = ctx->xch;
+
+    if ( pfn > ctx->x86_hvm.restore.max_allocated_pfn )
+    {
+        xen_pfn_t new_max;
+        size_t old_sz, new_sz;
+        unsigned long *p;
+
+        /* Round up to the nearest power of two larger than pfn, less 1. */
+        new_max = pfn;
+        new_max |= new_max >> 1;
+        new_max |= new_max >> 2;
+        new_max |= new_max >> 4;
+        new_max |= new_max >> 8;
+        new_max |= new_max >> 16;
+#ifdef __x86_64__
+        new_max |= new_max >> 32;
+#endif
+
+        old_sz = bitmap_size(ctx->x86_hvm.restore.max_allocated_pfn + 1);
+        new_sz = bitmap_size(new_max + 1);
+        p = realloc(ctx->x86_hvm.restore.allocated_pfns, new_sz);
+        if ( !p )
+        {
+            ERROR("Failed to realloc allocated bitmap");
+            errno = ENOMEM;
+            return -1;
+        }
+
+        memset((uint8_t *)p + old_sz, 0x00, new_sz - old_sz);
+
+        ctx->x86_hvm.restore.allocated_pfns = p;
+        ctx->x86_hvm.restore.max_allocated_pfn = new_max;
+    }
+
+    assert(!test_bit(pfn, ctx->x86_hvm.restore.allocated_pfns));
+    set_bit(pfn, ctx->x86_hvm.restore.allocated_pfns);
+
+    return 0;
+}
+
+static int x86_hvm_allocate_pfn(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    xc_interface *xch = ctx->xch;
+    bool success = false;
+    int rc = -1;
+    long done;
+    unsigned long i, nr_extents;
+    unsigned long stat_1g = 0, stat_2m = 0, stat_4k = 0;
+    unsigned long idx_1g, idx_2m;
+    unsigned long count;
+    xen_pfn_t base_pfn = 0, *sp_extents = ctx->x86_hvm.restore.sp_extents;
+
+    IPRINTF("pfn %lx\n", (long)pfn);
+    if (pfn_is_allocated(ctx, pfn))
+        return 0;
+
+    idx_1g = pfn >> SUPERPAGE_1GB_SHIFT;
+    idx_2m = pfn >> SUPERPAGE_2MB_SHIFT;
+    IPRINTF("idx_1g %lu idx_2m %lu\n", idx_1g, idx_2m);
+    if (!test_and_set_bit(idx_1g, ctx->x86_hvm.restore.attempted_1g)) {
+        count = 1UL << SUPERPAGE_1GB_SHIFT;
+        base_pfn = (pfn >> SUPERPAGE_1GB_SHIFT) << SUPERPAGE_1GB_SHIFT;
+        nr_extents = count >> SUPERPAGE_1GB_SHIFT;
+        IPRINTF("base_pfn %lx count %lu nr_extents %lu\n", (long)base_pfn, count, nr_extents);
+        for ( i = 0; i < nr_extents; i++ )
+            sp_extents[i] = base_pfn + (i<<SUPERPAGE_1GB_SHIFT);
+        done = xc_domain_populate_physmap(xch, ctx->domid, nr_extents, SUPERPAGE_1GB_SHIFT, 0, sp_extents);
+        IPRINTF("1G %lu -> %ld\n", nr_extents, done);
+        if (done > 0) {
+            success = true;
+            ctx->x86_hvm.restore.alloc_cnt += count;
+            stat_1g = done;
+            for (i = 0; i < (count >> SUPERPAGE_2MB_SHIFT); i++)
+                set_bit((base_pfn >> SUPERPAGE_2MB_SHIFT) + i, ctx->x86_hvm.restore.attempted_2m);
+        }
+    }
+
+    if (!test_and_set_bit(idx_2m, ctx->x86_hvm.restore.attempted_2m)) {
+        count = 1UL << SUPERPAGE_2MB_SHIFT;
+        base_pfn = (pfn >> SUPERPAGE_2MB_SHIFT) << SUPERPAGE_2MB_SHIFT;
+        nr_extents = count >> SUPERPAGE_2MB_SHIFT;
+        IPRINTF("base_pfn %lx count %lu nr_extents %lu\n", (long)base_pfn, count, nr_extents);
+        for ( i = 0; i < nr_extents; i++ )
+            sp_extents[i] = base_pfn + (i<<SUPERPAGE_2MB_SHIFT);
+        done = xc_domain_populate_physmap(xch, ctx->domid, nr_extents, SUPERPAGE_2MB_SHIFT, 0, sp_extents);
+        IPRINTF("2M %lu -> %ld\n", nr_extents, done);
+        if (done > 0) {
+            success = true;
+            ctx->x86_hvm.restore.alloc_cnt += count;
+            stat_2m = done;
+        }
+    }
+    if (success == false) {
+        count = 1;
+        sp_extents[0] = base_pfn = pfn;
+        done = xc_domain_populate_physmap(xch, ctx->domid, count, 0, 0, sp_extents);
+        if (done > 0) {
+            success = true;
+            ctx->x86_hvm.restore.alloc_cnt += count;
+            stat_4k = count;
+        }
+    }
+    IPRINTF("count %lu\n", count);
+    IPRINTF("1G %lu 2M %lu 4k %lu\n", stat_1g, stat_2m, stat_4k);
+    if (success == true) {
+        do {
+            count--;
+            rc = pfn_set_allocated(ctx, base_pfn + count);
+            if (rc)
+                break;
+        } while (count);
+    }
+    return rc;
+}
+
 struct xc_sr_restore_ops restore_ops_x86_hvm =
 {
+    .allocate_pfn    = x86_hvm_allocate_pfn,
     .pfn_is_valid    = x86_hvm_pfn_is_valid,
     .pfn_to_gfn      = x86_hvm_pfn_to_gfn,
     .set_gfn         = x86_hvm_set_gfn,
diff --git a/tools/libxc/xc_sr_restore_x86_pv.c b/tools/libxc/xc_sr_restore_x86_pv.c
index 50e25c162c..c426f14c73 100644
--- a/tools/libxc/xc_sr_restore_x86_pv.c
+++ b/tools/libxc/xc_sr_restore_x86_pv.c
@@ -1152,8 +1152,15 @@ static int x86_pv_cleanup(struct xc_sr_context *ctx)
     return 0;
 }
 
+static int x86_pv_allocate_pfn(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    errno = ENOMEM;
+    return -1;
+}
+
 struct xc_sr_restore_ops restore_ops_x86_pv =
 {
+    .allocate_pfn    = x86_pv_allocate_pfn,
     .pfn_is_valid    = x86_pv_pfn_is_valid,
     .pfn_to_gfn      = pfn_to_mfn,
     .set_page_type   = x86_pv_set_page_type,
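For reference, stripped of the attempted_1g/attempted_2m bookkeeping and
the IPRINTF statistics, the allocate-then-free flow above boils down to
the sketch below. populate_with_fallback() and punch_hole() are
illustrative names, not part of the patch; only the two libxc calls are
real.

/*
 * Minimal sketch of the speculative superpage allocation: try the 1GB
 * extent containing pfn, fall back to the 2MB extent, then to a single
 * 4kB page.  The real patch additionally records which extents were
 * already attempted so each superpage is tried at most once.
 */
#include <errno.h>
#include <xenctrl.h>

#define SP_2MB_SHIFT  9   /* 2MB ==    512 4kB pages */
#define SP_1GB_SHIFT 18   /* 1GB == 262144 4kB pages */

static int populate_with_fallback(xc_interface *xch, uint32_t domid,
                                  xen_pfn_t pfn)
{
    static const unsigned int orders[] = { SP_1GB_SHIFT, SP_2MB_SHIFT, 0 };
    unsigned int i;

    for ( i = 0; i < sizeof(orders) / sizeof(orders[0]); i++ )
    {
        /* Align pfn down to the start of the candidate extent. */
        xen_pfn_t extent = (pfn >> orders[i]) << orders[i];

        /* Return value is the number of extents actually populated. */
        if ( xc_domain_populate_physmap(xch, domid, 1, orders[i], 0,
                                        &extent) == 1 )
            return 0;
    }

    errno = ENOMEM;
    return -1;
}

/*
 * Punch a hole: hand one speculatively allocated, but never populated,
 * 4kB page back to Xen so the domain's allocation cap is not exceeded.
 */
static int punch_hole(xc_interface *xch, uint32_t domid, xen_pfn_t pfn)
{
    return xc_domain_decrease_reservation_exact(xch, domid, 1, 0, &pfn);
}

The ordering matters: a failed 1GB attempt must still allow a 2MB attempt
for the same range, which is why the patch keeps separate attempted_1g
and attempted_2m bitmaps instead of a single per-range flag.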