[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH] Make restore work properly with PV superpage flag



George, Keir, any thoughts or opinions on this patch?

On Fri, 2012-10-12 at 18:32 +0100, Dave McCracken wrote:
> For PV guests, the superpage flag means to unconditionally allocate all
> pages as superpages, which is required for Linux hugepages to work.  This
> was previously not supported on restore.  This patch adds proper support for
> the superpage flag on restore.
> 
> For HVM guests, the superpage flag has been used to mean attempt to allocate
> superpages if possible on restore.  This functionality has been preserved.
> 
> Signed-off-by: Dave McCracken <dave.mccracken@xxxxxxxxxx>
> 
> --------
> 
>  xc_domain_restore.c |  130 
> ++++++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 101 insertions(+), 29 deletions(-)
> 
> 
> --- xen-unstable/tools/libxc/xc_domain_restore.c      2012-08-18 
> 10:04:53.000000000 -0500
> +++ xen-unstable-restore//tools/libxc/xc_domain_restore.c     2012-08-20 
> 07:24:22.000000000 -0500
> @@ -23,6 +23,19 @@
>   *
>   */
>  
> +/*
> + * The superpages flag in restore has two different meanings depending on
> + * the type of domain.
> + *
> + * For an HVM domain, the flag means to look for properly aligned contiguous
> + * pages and try to allocate a superpage to satisfy it.  If that fails,
> + * fall back to small pages.
> + *
> + * For a PV domain, the flag means allocate all memory as superpages.  If 
> that
> + * fails, the restore fails.  This behavior is required for PV guests who
> + * want to use superpages.
> + */
> +
>  #include <stdlib.h>
>  #include <unistd.h>
>  
> @@ -41,6 +54,9 @@ struct restore_ctx {
>      xen_pfn_t *live_p2m; /* Live mapping of the table mapping each PFN to 
> its current MFN. */
>      xen_pfn_t *p2m; /* A table mapping each PFN to its new MFN. */
>      xen_pfn_t *p2m_batch; /* A table of P2M mappings in the current region.  
> */
> +    xen_pfn_t *p2m_saved_batch; /* Copy of p2m_batch array for pv superpage 
> alloc */
> +    int superpages; /* Superpage allocation has been requested */
> +    int hvm;    /* This is an hvm domain */
>      int completed; /* Set when a consistent image is available */
>      int last_checkpoint; /* Set when we should commit to the current 
> checkpoint when it completes. */
>      int compressing; /* Set when sender signals that pages would be sent 
> compressed (for Remus) */
> @@ -49,11 +65,6 @@ struct restore_ctx {
>  
>  #define HEARTBEAT_MS 1000
>  
> -#define SUPERPAGE_PFN_SHIFT  9
> -#define SUPERPAGE_NR_PFNS    (1UL << SUPERPAGE_PFN_SHIFT)
> -
> -#define SUPER_PAGE_START(pfn)    (((pfn) & (SUPERPAGE_NR_PFNS-1)) == 0 )
> -
>  #ifndef __MINIOS__
>  static ssize_t rdexact(xc_interface *xch, struct restore_ctx *ctx,
>                         int fd, void* buf, size_t size)
> @@ -103,6 +114,49 @@ static ssize_t rdexact(xc_interface *xch
>  #else
>  #define RDEXACT read_exact
>  #endif
> +
> +#define SUPERPAGE_PFN_SHIFT  9
> +#define SUPERPAGE_NR_PFNS    (1UL << SUPERPAGE_PFN_SHIFT)
> +#define SUPERPAGE(_pfn) ((_pfn) & (~(SUPERPAGE_NR_PFNS-1)))
> +#define SUPER_PAGE_START(pfn)    (((pfn) & (SUPERPAGE_NR_PFNS-1)) == 0 )

Why this code motion?

> +
> +/*
> +** When we're restoring into a pv superpage-allocated guest, we take
> +** a copy of the p2m_batch array to preserve the pfn, then allocate the
> +** corresponding superpages.  We then fill in the p2m array using the saved
> +** pfns.

So the pfns are not contiguous, but we back them with a super page? IOW
they are super pages in machine address space but not physical address
space?

If they are contig in p-space you just need to save the first one?

> +*/
> +static int alloc_superpage_mfns(
> +    xc_interface *xch, uint32_t dom, struct restore_ctx *ctx, int nr_mfns)
> +{
> +    int i, j, max = 0;
> +    unsigned long pfn, base_pfn, mfn;
> +    
> +    for (i = 0; i < nr_mfns; i++)
> +    {
> +        pfn = ctx->p2m_batch[i];
> +        base_pfn = SUPERPAGE(pfn);
> +        if (ctx->p2m[base_pfn] != (INVALID_P2M_ENTRY-2))
> +        {
> +            ctx->p2m_saved_batch[max] = base_pfn;

Is p2m_saved_batch used anywhere other than in this function? Could it
be a local variable?

> +            ctx->p2m_batch[max] = base_pfn;
> +            max++;
> +            ctx->p2m[base_pfn] = INVALID_P2M_ENTRY-2;

What is this magic -2? Can we get a symbolic constant (probably for the
whole expression)?

> +        }
> +    }
> +    if (xc_domain_populate_physmap_exact(xch, dom, max, SUPERPAGE_PFN_SHIFT,
> +                                         0, ctx->p2m_batch) != 0)
> +        return 1;
> +
> +    for (i = 0; i < max; i++)
> +    {
> +        mfn = ctx->p2m_batch[i];
> +        pfn = ctx->p2m_saved_batch[i];

If the original was != (INVALID_P2M_ENTRY-2) then you didn't initialise
p2m_saved_batch for this index?

> +        for (j = 0; j < SUPERPAGE_NR_PFNS; j++)
> +            ctx->p2m[pfn++] = mfn++;
> +    }
> +    return 0;
> +}
>  /*
>  ** In the state file (or during transfer), all page-table pages are
>  ** converted into a 'canonical' form where references to actual mfns
> @@ -113,7 +167,7 @@ static ssize_t rdexact(xc_interface *xch
>  static int uncanonicalize_pagetable(
>      xc_interface *xch, uint32_t dom, struct restore_ctx *ctx, void *page)
>  {
> -    int i, pte_last, nr_mfns = 0;
> +    int i, rc, pte_last, nr_mfns = 0;
>      unsigned long pfn;
>      uint64_t pte;
>      struct domain_info_context *dinfo = &ctx->dinfo;
> @@ -152,13 +206,20 @@ static int uncanonicalize_pagetable(
>      }
>  
>      /* Allocate the requisite number of mfns. */
> -    if ( nr_mfns &&
> -         (xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0,
> -                                            ctx->p2m_batch) != 0) )
> -    { 
> -        ERROR("Failed to allocate memory for batch.!\n"); 
> -        errno = ENOMEM;
> -        return 0; 
> +    if (nr_mfns)
> +    {
> +        if (!ctx->hvm && ctx->superpages)
> +            rc = alloc_superpage_mfns(xch, dom, ctx, nr_mfns);
> +        else
> +            rc = xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0,
> +                                                  ctx->p2m_batch);

Would it be clearer to rename alloc_superpage_mfns to (e.g.)
alloc_guest_mfns and have it start:
        if ( ctx->hvm || !ctx->superpages )
                return xc_domain...

Keeps the allocation logic in one place, since it is repeated again
below.

> +    
> +        if (rc)
> +        {
> +            ERROR("Failed to allocate memory for batch.!\n"); 
> +            errno = ENOMEM;
> +            return 0; 
> +        }
>      }
>      
>      /* Second pass: uncanonicalize each present PTE */
> @@ -1018,8 +1079,8 @@ static int pagebuf_get(xc_interface *xch
>  
>  static int apply_batch(xc_interface *xch, uint32_t dom, struct restore_ctx 
> *ctx,
>                         xen_pfn_t* region_mfn, unsigned long* pfn_type, int 
> pae_extended_cr3,
> -                       unsigned int hvm, struct xc_mmu* mmu,
> -                       pagebuf_t* pagebuf, int curbatch, int superpages)
> +                       struct xc_mmu* mmu,
> +                       pagebuf_t* pagebuf, int curbatch)
>  {
>      int i, j, curpage, nr_mfns;
>      int k, scount;
> @@ -1061,7 +1122,7 @@ static int apply_batch(xc_interface *xch
>                  /* Is this the next expected continuation? */
>                  if ( pfn == superpage_start + scount )
>                  {
> -                    if ( !superpages )
> +                    if ( !ctx->superpages )
>                      {
>                          ERROR("Unexpexted codepath with no superpages");
>                          return -1;
> @@ -1114,16 +1175,17 @@ static int apply_batch(xc_interface *xch
>              }
>  
>              /* Are we ready to start a new superpage candidate? */
> -            if ( superpages && SUPER_PAGE_START(pfn) )
> +            if ( ctx->hvm && ctx->superpages && SUPER_PAGE_START(pfn) )

Can we push the movement of these things into the ctx struct into a
separate preparatory patch so it's easier to see the actual code
changes.

>              {
>                  superpage_start=pfn;
>                  scount++;
> -                continue;
>              }
> -            
> -            /* Add the current pfn to pfn_batch */
> -            ctx->p2m_batch[nr_mfns++] = pfn; 
> -            ctx->p2m[pfn]--;
> +            else
> +            {
> +                /* Add the current pfn to pfn_batch */
> +                ctx->p2m_batch[nr_mfns++] = pfn; 
> +                ctx->p2m[pfn]--;
> +            }
>          }
>      }
>  
> @@ -1144,9 +1206,14 @@ static int apply_batch(xc_interface *xch
>      {
>          DPRINTF("Mapping order 0,  %d; first pfn %lx\n", nr_mfns, 
> ctx->p2m_batch[0]);
>      
> -        if(xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0,
> -                                            0, ctx->p2m_batch) != 0) 
> -        { 
> +        if (!ctx->hvm && ctx->superpages)
> +            rc = alloc_superpage_mfns(xch, dom, ctx, nr_mfns);
> +        else
> +            rc = xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0,
> +                                                  ctx->p2m_batch);
> +    
> +        if (rc)
> +        {
>              ERROR("Failed to allocate memory for batch.!\n"); 
>              errno = ENOMEM;
>              return -1;
> @@ -1175,7 +1242,7 @@ static int apply_batch(xc_interface *xch
>               || pagetype == XEN_DOMCTL_PFINFO_XALLOC )
>              region_mfn[i] = ~0UL; /* map will fail but we don't care */
>          else
> -            region_mfn[i] = hvm ? pfn : ctx->p2m[pfn]; 
> +            region_mfn[i] = ctx->hvm ? pfn : ctx->p2m[pfn]; 
>      }
>  
>      /* Map relevant mfns */
> @@ -1298,7 +1365,7 @@ static int apply_batch(xc_interface *xch
>              }
>          }
>  
> -        if ( !hvm &&
> +        if ( !ctx->hvm &&
>               xc_add_mmu_update(xch, mmu,
>                                 (((unsigned long long)mfn) << PAGE_SHIFT)
>                                 | MMU_MACHPHYS_UPDATE, pfn) )
> @@ -1389,6 +1456,9 @@ int xc_domain_restore(xc_interface *xch,
>  
>      memset(ctx, 0, sizeof(*ctx));
>  
> +    ctx->superpages = superpages;
> +    ctx->hvm = hvm;
> +
>      ctxt = xc_hypercall_buffer_alloc(xch, ctxt, sizeof(*ctxt));
>  
>      if ( ctxt == NULL )
> @@ -1452,6 +1522,9 @@ int xc_domain_restore(xc_interface *xch,
>  
>      region_mfn = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), 
> PAGE_SHIFT));
>      ctx->p2m_batch = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), 
> PAGE_SHIFT));
> +    if (!ctx->hvm && ctx->superpages)
> +        ctx->p2m_saved_batch =
> +            malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
>  
>      if ( (ctx->p2m == NULL) || (pfn_type == NULL) ||
>           (region_mfn == NULL) || (ctx->p2m_batch == NULL) )
> @@ -1575,8 +1648,7 @@ int xc_domain_restore(xc_interface *xch,
>              int brc;
>  
>              brc = apply_batch(xch, dom, ctx, region_mfn, pfn_type,
> -                              pae_extended_cr3, hvm, mmu, &pagebuf, curbatch,
> -                              superpages);
> +                              pae_extended_cr3, mmu, &pagebuf, curbatch);
>              if ( brc < 0 )
>                  goto out;
>  
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxx
> http://lists.xen.org/xen-devel



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.