Re: [PATCH v2 2/2] live migration: use superpages for physmap population on restore when possible
On 06.09.2022 11:54, Andrei Semenov wrote:
> Implement an heuristic for X86 HVM guests which tries to use superpages while
> populating guest physmap on live migration. This should impove memory accesses
> performances for these guests.
>
> Signed-off-by: Andrei Semenov <andrei.semenov@xxxxxxxx>

Olaf - I recall you've done some similar work before. Do you have any
thoughts here, perhaps going as far as merging your and Andrei's work?

Jan

> ---
>  tools/include/xen-tools/libs.h           |  4 ++
>  tools/libs/guest/xg_private.h            |  3 +
>  tools/libs/guest/xg_sr_common.h          | 18 ++++-
>  tools/libs/guest/xg_sr_restore.c         | 60 +++++++---------
>  tools/libs/guest/xg_sr_restore_x86_hvm.c | 88 +++++++++++++++++++++++-
>  tools/libs/guest/xg_sr_restore_x86_pv.c  | 22 +++++-
>  6 files changed, 154 insertions(+), 41 deletions(-)
>
> diff --git a/tools/include/xen-tools/libs.h b/tools/include/xen-tools/libs.h
> index a16e0c3807..bdd903eb7b 100644
> --- a/tools/include/xen-tools/libs.h
> +++ b/tools/include/xen-tools/libs.h
> @@ -63,4 +63,8 @@
>  #define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
>  #endif
>
> +#ifndef ROUNDDOWN
> +#define ROUNDDOWN(_x,_w) ((unsigned long)(_x) & (-1UL << (_w)))
> +#endif
> +
>  #endif /* __XEN_TOOLS_LIBS__ */
> diff --git a/tools/libs/guest/xg_private.h b/tools/libs/guest/xg_private.h
> index 09e24f1227..dcf63b5188 100644
> --- a/tools/libs/guest/xg_private.h
> +++ b/tools/libs/guest/xg_private.h
> @@ -134,6 +134,9 @@ typedef uint64_t x86_pgentry_t;
>  #define PAGE_SIZE_X86 (1UL << PAGE_SHIFT_X86)
>  #define PAGE_MASK_X86 (~(PAGE_SIZE_X86-1))
>
> +#define S_PAGE_1GB_ORDER 18
> +#define S_PAGE_2MB_ORDER 9
> +
>  #define NRPAGES(x) (ROUNDUP(x, PAGE_SHIFT) >> PAGE_SHIFT)
>
>  static inline xen_pfn_t xc_pfn_to_mfn(xen_pfn_t pfn, xen_pfn_t *p2m,
> diff --git a/tools/libs/guest/xg_sr_common.h b/tools/libs/guest/xg_sr_common.h
> index 941e24d7b7..96365e05a8 100644
> --- a/tools/libs/guest/xg_sr_common.h
> +++ b/tools/libs/guest/xg_sr_common.h
> @@ -137,7 +137,8 @@ struct xc_sr_restore_ops
>      bool (*pfn_is_valid)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
>
>      /* Set the GFN of a PFN. */
> -    void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn);
> +    void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn,
> +                    unsigned int order);
>
>      /* Set the type of a PFN. */
>      void (*set_page_type)(struct xc_sr_context *ctx, xen_pfn_t pfn,
> @@ -175,6 +176,17 @@ struct xc_sr_restore_ops
>  #define BROKEN_CHANNEL 2
>      int (*process_record)(struct xc_sr_context *ctx, struct xc_sr_record *rec);
>
> +    /**
> +     * Guest physmap population order is based on heuristic which is family
> +     * dependant. X86 HVM heuristic is interested in observing the whole
> +     * record (the first) in order to guess how the physmap should be populated.
> +     */
> +    void (*guess_physmap)(struct xc_sr_context *ctx, unsigned int count,
> +                          const xen_pfn_t *pfns, const uint32_t *types);
> +
> +    /* Get the physmap population order for given PFN */
> +    int (*get_physmap_order)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
> +
>      /**
>       * Perform any actions required after the static data has arrived.  Called
>       * when the STATIC_DATA_COMPLETE record has been recieved/inferred.
> @@ -404,6 +416,10 @@ struct xc_sr_context
>              {
>                  /* HVM context blob. */
>                  struct xc_sr_blob context;
> +
> +                /* Set guest type (based on the first record) */
> +                bool set_guest_type;
> +                bool pvh_guest;
>              } restore;
>          };
>      } hvm;
> diff --git a/tools/libs/guest/xg_sr_restore.c b/tools/libs/guest/xg_sr_restore.c
> index 074b56d263..af864bd5ea 100644
> --- a/tools/libs/guest/xg_sr_restore.c
> +++ b/tools/libs/guest/xg_sr_restore.c
> @@ -86,18 +86,21 @@ static bool pfn_is_populated(const struct xc_sr_context *ctx, xen_pfn_t pfn)
>   * avoid realloc()ing too excessively, the size increased to the nearest power
>   * of two large enough to contain the required pfn.
>   */
> -static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
> +static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn,
> +                             unsigned int order)
>  {
>      xc_interface *xch = ctx->xch;
> +    xen_pfn_t start_pfn = ROUNDDOWN(pfn, order),
> +              end_pfn = (ROUNDUP(pfn + 1, order) - 1);
>
> -    if ( pfn > ctx->restore.max_populated_pfn )
> +    if ( end_pfn > ctx->restore.max_populated_pfn )
>      {
>          xen_pfn_t new_max;
>          size_t old_sz, new_sz;
>          unsigned long *p;
>
>          /* Round up to the nearest power of two larger than pfn, less 1. */
> -        new_max = pfn;
> +        new_max = end_pfn;
>          new_max |= new_max >> 1;
>          new_max |= new_max >> 2;
>          new_max |= new_max >> 4;
> @@ -123,8 +126,11 @@ static int pfn_set_populated(struct xc_sr_context *ctx, xen_pfn_t pfn)
>          ctx->restore.max_populated_pfn = new_max;
>      }
>
> -    assert(!test_bit(pfn, ctx->restore.populated_pfns));
> -    set_bit(pfn, ctx->restore.populated_pfns);
> +    for ( pfn = start_pfn; pfn <= end_pfn; ++pfn )
> +    {
> +        assert(!test_bit(pfn, ctx->restore.populated_pfns));
> +        set_bit(pfn, ctx->restore.populated_pfns);
> +    }
>
>      return 0;
>  }
> @@ -138,60 +144,40 @@ int populate_pfns(struct xc_sr_context *ctx, unsigned int count,
>                    const xen_pfn_t *original_pfns, const uint32_t *types)
>  {
>      xc_interface *xch = ctx->xch;
> -    xen_pfn_t *mfns = malloc(count * sizeof(*mfns)),
> -        *pfns = malloc(count * sizeof(*pfns));
> -    unsigned int i, nr_pfns = 0;
> +    xen_pfn_t mfn, pfn;
> +    unsigned int i, order;
>      int rc = -1;
>
> -    if ( !mfns || !pfns )
> -    {
> -        ERROR("Failed to allocate %zu bytes for populating the physmap",
> -              2 * count * sizeof(*mfns));
> -        goto err;
> -    }
> +    /* Feed this record for family dependant heuristic to guess the physmap */
> +    ctx->restore.ops.guess_physmap(ctx, count, original_pfns, types);
>
>      for ( i = 0; i < count; ++i )
>      {
>          if ( (!types || page_type_to_populate(types[i])) &&
>               !pfn_is_populated(ctx, original_pfns[i]) )
>          {
> -            rc = pfn_set_populated(ctx, original_pfns[i]);
> +            order = ctx->restore.ops.get_physmap_order(ctx, original_pfns[i]);
> +            rc = pfn_set_populated(ctx, original_pfns[i], order);
>              if ( rc )
>                  goto err;
> -            pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i];
> -            ++nr_pfns;
> -        }
> -    }
> -
> -    if ( nr_pfns )
> -    {
> -        rc = xc_domain_populate_physmap_exact(
> -            xch, ctx->domid, nr_pfns, 0, 0, mfns);
> -        if ( rc )
> -        {
> -            PERROR("Failed to populate physmap");
> -            goto err;
> -        }
>
> -        for ( i = 0; i < nr_pfns; ++i )
> -        {
> -            if ( mfns[i] == INVALID_MFN )
> +            pfn = mfn = ROUNDDOWN(original_pfns[i], order);
> +            rc = xc_domain_populate_physmap_exact(xch, ctx->domid, 1, order, 0,
> +                                                  &mfn);
> +            if ( rc || (mfn == INVALID_MFN) )
>              {
> -                ERROR("Populate physmap failed for pfn %u", i);
> +                ERROR("Failed to populate physmap for pfn %lu (%u)", pfn, order);
>                  rc = -1;
>                  goto err;
>              }
>
> -            ctx->restore.ops.set_gfn(ctx, pfns[i], mfns[i]);
> +            ctx->restore.ops.set_gfn(ctx, pfn, mfn, order);
>          }
>      }
>
>      rc = 0;
>
>   err:
> -    free(pfns);
> -    free(mfns);
> -
>      return rc;
>  }
>
> diff --git a/tools/libs/guest/xg_sr_restore_x86_hvm.c b/tools/libs/guest/xg_sr_restore_x86_hvm.c
> index d6ea6f3012..2e525443ab 100644
> --- a/tools/libs/guest/xg_sr_restore_x86_hvm.c
> +++ b/tools/libs/guest/xg_sr_restore_x86_hvm.c
> @@ -110,7 +110,7 @@ static xen_pfn_t x86_hvm_pfn_to_gfn(const struct xc_sr_context *ctx,
>
>  /* restore_ops function. */
>  static void x86_hvm_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
> -                            xen_pfn_t gfn)
> +                            xen_pfn_t gfn, unsigned int order)
>  {
>      /* no op */
>  }
> @@ -161,6 +161,8 @@ static int x86_hvm_setup(struct xc_sr_context *ctx)
>      }
>  #endif
>
> +    ctx->x86.hvm.restore.set_guest_type = true;
> +
>      return 0;
>  }
>
> @@ -192,6 +194,88 @@ static int x86_hvm_process_record(struct xc_sr_context *ctx,
>      }
>  }
>
> +/*
> + * We consider that PVH guest physmap starts from 0 and coninugiously cover the
> + * pysical memory space for the first GB of memory. HVM guest will have I/0
> + * holes in the first 2MB of memory space (at least for VGA). Therefore we
> + * should observe the very first record (wich comes in physmap order) to find
> + * out how we should map this first GB.
> + * To map the rest of the memory space in both cases (PVH or HVM) we will use
> + * the maximum available order (up to 1GB), except for forth GB wich holds the
> + * low MMIO hole (at least for LAPIC MMIO window and for potential passthroughed
> + * or emulated PCI devices BARs).
> + */
> +static void x86_hvm_guess_physmap(struct xc_sr_context *ctx, unsigned int count,
> +                                  const xen_pfn_t *pfns, const uint32_t *types)
> +{
> +    xen_pfn_t prev;
> +    unsigned int i;
> +
> +
> +    if ( !ctx->x86.hvm.restore.set_guest_type )
> +        return;
> +
> +    for ( i = 0, prev = INVALID_PFN; i < count; ++i )
> +    {
> +        if ( !types || page_type_to_populate(types[i]) )
> +        {
> +            if ( prev == INVALID_MFN )
> +            {
> +                if (pfns[i] != 0)
> +                    break;
> +            }
> +            else
> +            {
> +                if ( pfns[i] != (prev + 1) )
> +                    break;
> +            }
> +            prev = pfns[i];
> +        }
> +    }
> +
> +    ctx->x86.hvm.restore.pvh_guest = (i == count) ? true : false;
> +    ctx->x86.hvm.restore.set_guest_type = false;
> +}
> +
> +/*
> + *
> + */
> +static int x86_hvm_get_physmap_order(const struct xc_sr_context *ctx,
> +                                     xen_pfn_t pfn)
> +{
> +    int order;
> +
> +    if ( pfn >= ctx->restore.p2m_size )
> +        return 0;
> +
> +    switch (pfn >> S_PAGE_1GB_ORDER)
> +    {
> +    case 3:
> +        /* The forth GB of memory is mapped with 2MB superpages */
> +        order = S_PAGE_2MB_ORDER;
> +        break;
> +    case 0:
> +        if (!ctx->x86.hvm.restore.pvh_guest)
> +        {
> +            /* First 2MB are mapped as 4K for HVM guest */
> +            order = (pfn > 0x1ff) ? S_PAGE_2MB_ORDER : 0;
> +            break;
> +        }
> +    default:
> +        order = S_PAGE_1GB_ORDER;
> +    }
> +
> +    if ( ((ROUNDUP(pfn + 1, S_PAGE_1GB_ORDER) - 1) >= ctx->restore.p2m_size) &&
> +         order == S_PAGE_1GB_ORDER )
> +        order = S_PAGE_2MB_ORDER;
> +
> +    if ( ((ROUNDUP(pfn + 1, S_PAGE_2MB_ORDER) - 1) >= ctx->restore.p2m_size) &&
> +         order == S_PAGE_2MB_ORDER )
> +        order = 0;
> +
> +    return order;
> +}
> +
>  /*
>   * restore_ops function.  Sets extra hvm parameters and seeds the grant table.
>   */
> @@ -258,6 +342,8 @@ struct xc_sr_restore_ops restore_ops_x86_hvm =
>      .localise_page = x86_hvm_localise_page,
>      .setup = x86_hvm_setup,
>      .process_record = x86_hvm_process_record,
> +    .guess_physmap = x86_hvm_guess_physmap,
> +    .get_physmap_order = x86_hvm_get_physmap_order,
>      .static_data_complete = x86_static_data_complete,
>      .stream_complete = x86_hvm_stream_complete,
>      .cleanup = x86_hvm_cleanup,
> diff --git a/tools/libs/guest/xg_sr_restore_x86_pv.c b/tools/libs/guest/xg_sr_restore_x86_pv.c
> index dc50b0f5a8..f8545f941a 100644
> --- a/tools/libs/guest/xg_sr_restore_x86_pv.c
> +++ b/tools/libs/guest/xg_sr_restore_x86_pv.c
> @@ -59,7 +59,7 @@ static int expand_p2m(struct xc_sr_context *ctx, unsigned long max_pfn)
>      ctx->x86.pv.max_pfn = max_pfn;
>      for ( i = (old_max ? old_max + 1 : 0); i <= max_pfn; ++i )
>      {
> -        ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN);
> +        ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN, 0);
>          ctx->restore.ops.set_page_type(ctx, i, 0);
>      }
>
> @@ -947,9 +947,10 @@ static void x86_pv_set_page_type(struct xc_sr_context *ctx, xen_pfn_t pfn,
>
>  /* restore_ops function. */
>  static void x86_pv_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
> -                           xen_pfn_t mfn)
> +                           xen_pfn_t mfn, unsigned int order)
>  {
>      assert(pfn <= ctx->x86.pv.max_pfn);
> +    assert(!order);
>
>      if ( ctx->x86.pv.width == sizeof(uint64_t) )
>          /* 64 bit guest.  Need to expand INVALID_MFN for 32 bit toolstacks. */
> @@ -1113,6 +1114,21 @@ static int x86_pv_process_record(struct xc_sr_context *ctx,
>      }
>  }
>
> +/*
> + * There's no reliable heuristic which can predict the PV guest physmap.
> + * Therefore the 0 order always will be used.
> + */
> +static void x86_pv_guess_physmap(struct xc_sr_context *ctx, unsigned int count,
> +                                 const xen_pfn_t *pfns, const uint32_t *types)
> +{
> +}
> +
> +static int x86_pv_get_physmap_order(const struct xc_sr_context *ctx,
> +                                    xen_pfn_t pfn)
> +{
> +    return 0;
> +}
> +
>  /*
>   * restore_ops function.  Update the vcpu context in Xen, pin the pagetables,
>   * rewrite the p2m and seed the grant table.
> @@ -1194,6 +1210,8 @@ struct xc_sr_restore_ops restore_ops_x86_pv =
>      .localise_page = x86_pv_localise_page,
>      .setup = x86_pv_setup,
>      .process_record = x86_pv_process_record,
> +    .guess_physmap = x86_pv_guess_physmap,
> +    .get_physmap_order = x86_pv_get_physmap_order,
>      .static_data_complete = x86_static_data_complete,
>      .stream_complete = x86_pv_stream_complete,
>      .cleanup = x86_pv_cleanup,
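As a side note for readers following the order arithmetic in pfn_set_populated() and populate_pfns() above: here is a minimal standalone sketch (not part of the patch; the example pfn value is arbitrary) of how ROUNDDOWN()/ROUNDUP() with a page order delimit the aligned extent that gets marked as populated and then requested as a single order-sized allocation from xc_domain_populate_physmap_exact(). The macros and order constants mirror the ones the patch adds.

/* Standalone illustration (not from the patch): how an allocation order
 * turns a single pfn into the aligned superpage extent covering it. */
#include <stdio.h>

#define ROUNDUP(_x,_w)   (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1))
#define ROUNDDOWN(_x,_w) ((unsigned long)(_x) & (-1UL << (_w)))

#define S_PAGE_2MB_ORDER 9   /* 2^9  4KiB pages = 2MiB */
#define S_PAGE_1GB_ORDER 18  /* 2^18 4KiB pages = 1GiB */

int main(void)
{
    unsigned long pfn = 0x12345;            /* arbitrary example pfn */
    unsigned int order = S_PAGE_2MB_ORDER;

    /* First and last pfn of the order-aligned extent containing pfn. */
    unsigned long start = ROUNDDOWN(pfn, order);
    unsigned long end = ROUNDUP(pfn + 1, order) - 1;

    printf("pfn %#lx lies in the order-%u extent %#lx..%#lx (%lu pages)\n",
           pfn, order, start, end, end - start + 1);
    return 0;
}

With S_PAGE_2MB_ORDER this yields a 512-page (2MiB) extent starting at a 2MiB-aligned pfn, which is the unit populate_pfns() now asks the hypervisor for whenever the heuristic returns a non-zero order.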