[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH v4 5/9] tools/libxc: common code



On Wed, 2014-04-30 at 19:36 +0100, Andrew Cooper wrote:
> Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
> Signed-off-by: Frediano Ziglio <frediano.ziglio@xxxxxxxxxx>
> Signed-off-by: David Vrabel <david.vrabel@xxxxxxxxxx>
> ---
>  tools/libxc/saverestore/common.c         |   87 ++++++
>  tools/libxc/saverestore/common.h         |  172 ++++++++++++
>  tools/libxc/saverestore/common_x86.c     |   54 ++++
>  tools/libxc/saverestore/common_x86.h     |   21 ++
>  tools/libxc/saverestore/common_x86_hvm.c |   53 ++++
>  tools/libxc/saverestore/common_x86_pv.c  |  431 
> ++++++++++++++++++++++++++++++
>  tools/libxc/saverestore/common_x86_pv.h  |  104 +++++++
>  tools/libxc/saverestore/restore.c        |  288 ++++++++++++++++++++
>  tools/libxc/saverestore/save.c           |   42 +++
>  9 files changed, 1252 insertions(+)
>  create mode 100644 tools/libxc/saverestore/common_x86.c
>  create mode 100644 tools/libxc/saverestore/common_x86.h
>  create mode 100644 tools/libxc/saverestore/common_x86_hvm.c
>  create mode 100644 tools/libxc/saverestore/common_x86_pv.c
>  create mode 100644 tools/libxc/saverestore/common_x86_pv.h
> 
> diff --git a/tools/libxc/saverestore/common.c 
> b/tools/libxc/saverestore/common.c
> index de2e727..b159c4c 100644
> --- a/tools/libxc/saverestore/common.c
> +++ b/tools/libxc/saverestore/common.c
> @@ -1,3 +1,5 @@
> +#include <assert.h>
> +
>  #include "common.h"
>  
>  static const char *dhdr_types[] =
> @@ -52,6 +54,91 @@ const char *rec_type_to_str(uint32_t type)
>      return "Reserved";
>  }
>  
> +int write_split_record(struct context *ctx, struct record *rec,
> +                       void *buf, size_t sz)
> +{
> +    static const char zeroes[7] = { 0 };
> +    xc_interface *xch = ctx->xch;
> +    uint32_t combined_length = rec->length + sz;
> +    size_t record_length = (combined_length + 7) & ~7UL;

Isn't this ROUNDUP(combined_length, 3)? (Where 3 and/or 7 perhaps ought
to be given names)

> +
> +    if ( record_length > REC_LENGTH_MAX )
> +    {
> +        ERROR("Record (0x%08"PRIx32", %s) length 0x%"PRIx32
> +              " exceeds max (0x%"PRIx32")", rec->type,
> +              rec_type_to_str(rec->type), rec->length, REC_LENGTH_MAX);
> +        return -1;
> +    }
> +
> +    if ( rec->length )
> +        assert(rec->data);
> +    if ( sz )
> +        assert(buf);
> +
> +    if ( write_exact(ctx->fd, &rec->type, sizeof rec->type) ||

libxc style appears to be "sizeof (rec->type)" (the space seems odd to
me but is apparently The Style, I think the brackets are a must).

> +         write_exact(ctx->fd, &combined_length, sizeof rec->length) ||
> +         (rec->length && write_exact(ctx->fd, rec->data, rec->length)) ||
> +         (sz && write_exact(ctx->fd, buf, sz)) ||
> +         write_exact(ctx->fd, zeroes, record_length - combined_length) )
> +    {
> +        PERROR("Unable to write record to stream");
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +int read_record(struct context *ctx, struct record *rec)
> +{
> +    xc_interface *xch = ctx->xch;
> +    struct rhdr rhdr;
> +    size_t datasz;
> +
> +    if ( read_exact(ctx->fd, &rhdr, sizeof rhdr) )
> +    {
> +        PERROR("Failed to read Record Header from stream");
> +        return -1;
> +    }
> +    else if ( rhdr.length > REC_LENGTH_MAX )
> +    {
> +        ERROR("Record (0x%08"PRIx32", %s) length 0x%"PRIx32
> +              " exceeds max (0x%"PRIx32")",
> +              rhdr.type, rec_type_to_str(rhdr.type),
> +              rhdr.length, REC_LENGTH_MAX);
> +        return -1;
> +    }
> +
> +    datasz = (rhdr.length + 7) & ~7U;

Another ROUNDUP and #define XXX 3 or 7 opportunity. (I'm not going to
mention any more of these I find).

Is it not a bug in the input for datasz to not be aligned? I thought you
were padding everything.

> diff --git a/tools/libxc/saverestore/common.h 
> b/tools/libxc/saverestore/common.h
> index fff0a39..a35eda7 100644
> --- a/tools/libxc/saverestore/common.h
> +++ b/tools/libxc/saverestore/common.h
> @@ -1,7 +1,20 @@
>  #ifndef __COMMON__H
>  #define __COMMON__H
>  
> +#include <stdbool.h>
> +
> +// Hack out junk from the namespace
 ... namespace, because/in order to...

(to prevent accidents I suppose?)

> +#define mfn_to_pfn __UNUSED_mfn_to_pfn
> +#define pfn_to_mfn __UNUSED_pfn_to_mfn
> +
>  #include "../xg_private.h"
> +#include "../xg_save_restore.h"
> +#include "../xc_dom.h"
> +#include "../xc_bitops.h"
> +
> +#undef mfn_to_pfn
> +#undef pfn_to_mfn
> +
>  
> [...]

> +struct context
> +{
> +    xc_interface *xch;
> +    uint32_t domid;
> +    int fd;
> +
> +    xc_dominfo_t dominfo;
> +
> +    struct save_restore_ops ops;
> +
> +    union
> +    {
> +        struct
> +        {
> +            /* From Image Header */
> +            uint32_t format_version;
> +
> +            /* From Domain Header */
> +            uint32_t guest_type;
> +            uint32_t guest_page_size;
> +

I suppose the following block is not from the Domain Header. /* Populate
by builder */ or something?

[...]

> +    union
> +    {
> +        struct
> +        {
> +            /* 4 or 8; 32 or 64 bit domain */
> +            unsigned int width;
> +            /* 3 or 4 pagetable levels */
> +            unsigned int levels;
> +
> +

Deliberate double spacing? (You don't do it consistently)
[...]

> +};
> +
> +/*
> + * Write the image and domain headers to the stream.
> + * (to eventually make static in save.c)

Is this because of the 2/9 HACK to maintain both? So far it seems to me
that the next iteration could drop that and go for the big switcheroo,
but if not could we agree some tag for things which are specific to that
in order to distinguish from longer term todos or hacky bits? (So I know
which to ignore and which to consider)

> + */
> +int write_headers(struct context *ctx, uint16_t guest_type);
> +
> +extern struct save_restore_ops save_restore_ops_x86_pv;
> +extern struct save_restore_ops save_restore_ops_x86_hvm;

From the union in the context struct  I'm only expecting x86_pv in this
patch, so perhaps x86_hvm was intended to arrive later?

(a non-zero length commit message might have confirmed what was intended
to be supplied here)

> +/*
> + * Writes a split record to the stream, applying correct padding where
> + * appropriate.  It is common when sending records containing blobs from Xen
> + * that the header and blob data are separate.  This function accepts a 
> second
> + * buffer and length, and will merge it with the main record when sending.
> + *
> + * Records with a non-zero length must provide a valid data field; records
> + * with a 0 length shall have their data field ignored.
> + *
> + * Returns 0 on success and non0 on failure.

"non-0". Do the non-zero values have any particular meanings? (errno
codes, other error codes?)

> + */
> +int write_split_record(struct context *ctx, struct record *rec, void *buf, 
> size_t sz);
> +
> +/*
> + * Writes a record to the stream, applying correct padding where appropriate.
> + * Records with a non-zero length must provide a valid data field; records
> + * with a 0 length shall have their data field ignored.
> + *
> + * Returns 0 on success and non0 on failure.
> + */
> +static inline int write_record(struct context *ctx, struct record *rec)
> +{
> +    return write_split_record(ctx, rec, NULL, 0);
> +}
> +
> +/*
> + * Reads a record from the stream, and fills in the record structure.
> + *
> + * Returns 0 on success and non-0 on failure.
> + *
> + * On success, the records type and size shall be valid.
> + * - If size is 0, data shall be NULL.
> + * - If size is non-0, data shall be a buffer allocated by malloc() which 
> must
> + *   be passed to free() by the caller.
> + *
> + * On failure, the contents of the record structure are undefined.
> + */
> +int read_record(struct context *ctx, struct record *rec);
> +
> +int write_page_data_and_pause(struct context *ctx);

I suppose the page_data referred to here is accumulated in ctx?

> +int handle_page_data(struct context *ctx, struct record *rec);

Handle it how? Is this the restore side counterpart to
write_page_data_...?

> +int populate_pfns(struct context *ctx, unsigned count,
> +                  const xen_pfn_t *original_pfns, const uint32_t *types);

Populate from what?

(You spoiled me with doc comments on the first few, so I'm being picky
here ;-))

> +
>  #endif
>  /*
>   * Local variables:
> diff --git a/tools/libxc/saverestore/common_x86_hvm.c 
> b/tools/libxc/saverestore/common_x86_hvm.c
> new file mode 100644
> index 0000000..0b9aac2
> --- /dev/null
> +++ b/tools/libxc/saverestore/common_x86_hvm.c

How delightfully uninteresting.

> diff --git a/tools/libxc/saverestore/common_x86_pv.c 
> b/tools/libxc/saverestore/common_x86_pv.c
> new file mode 100644
> index 0000000..35bce27
> --- /dev/null
> +++ b/tools/libxc/saverestore/common_x86_pv.c

How terrifying.

> @@ -0,0 +1,431 @@
> +#include <assert.h>
> +
> +#include "common_x86_pv.h"
> +
> +xen_pfn_t mfn_to_pfn(struct context *ctx, xen_pfn_t mfn)
> +{
> +    assert(mfn <= ctx->x86_pv.max_mfn);

Just to confirm that anywhere there is an assert like this then if it is
based on potentially user controller input then it has already been
validated elsewhere first? (with the abstraction I couldn't spot where
that was)

> +    return ctx->x86_pv.m2p[mfn];
> +}
> +
> +static bool x86_pv_pfn_is_valid(struct context *ctx, xen_pfn_t pfn)
> +{
> +    return pfn <= ctx->x86_pv.max_pfn;
> +}
> +
> +static xen_pfn_t x86_pv_pfn_to_gfn(struct context *ctx, xen_pfn_t pfn)
> +{
> +    assert(pfn <= ctx->x86_pv.max_pfn);
> +
> +    if ( ctx->x86_pv.width == sizeof (uint64_t) )
> +        /* 64 bit guest.  Need to truncate their pfns for 32 bit toolstacks 
> */
> +        return ((uint64_t *)ctx->x86_pv.p2m)[pfn];
> +    else
> +    {
> +        /* 32 bit guest.  Need to expand INVALID_MFN fot 64 bit toolstacks */

Typo: "fot"

> +        uint32_t mfn = ((uint32_t *)ctx->x86_pv.p2m)[pfn];
> +
> +        return mfn == ~0U ? INVALID_MFN : mfn;
> +    }
> +}
> +
> +static void x86_pv_set_page_type(struct context *ctx, xen_pfn_t pfn,
> +                                 unsigned long type)
> +{
> +    assert(pfn <= ctx->x86_pv.max_pfn);
> +
> +    ctx->x86_pv.pfn_types[pfn] = type;
> +}
> +
> +static void x86_pv_set_gfn(struct context *ctx, xen_pfn_t pfn,
> +                           xen_pfn_t mfn)
> +{
> +    assert(pfn <= ctx->x86_pv.max_pfn);
> +
> +    if ( ctx->x86_pv.width == sizeof (uint64_t) )
> +        /* 64 bit guest.  Need to expand INVALID_MFN for 32 bit toolstacks */
> +        ((uint64_t *)ctx->x86_pv.p2m)[pfn] = mfn == INVALID_MFN ? ~0ULL : 
> mfn;
> +    else
> +        /* 32 bit guest.  Can safely truncate INVALID_MFN fot 64 bit 
> toolstacks */

Another fot.

> +        ((uint32_t *)ctx->x86_pv.p2m)[pfn] = mfn;
> +}
> +
> +static int normalise_pagetable(struct context *ctx, const uint64_t *src,
> +                               uint64_t *dst, unsigned long type)
> +{
> +    xc_interface *xch = ctx->xch;
> +    uint64_t pte;
> +    unsigned i, xen_first = -1, xen_last = -1; /* Indicies of Xen mappings */
> +
> +    type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
> +
> +    if ( ctx->x86_pv.levels == 4 )
> +    {
> +        /* 64bit guests only have Xen mappings in their L4 tables */

You (correctly) infer from the pt levels that this must be a 64-bit
guest. Assert?

> +        if ( type == XEN_DOMCTL_PFINFO_L4TAB )
> +        {
> +            xen_first = 256;
> +            xen_last = 271;

Can we #define all these numbers please?

> +        }
> +    }
> +    else
> +    {

Must be 32 bit? Assert?

> +        switch ( type )
> +        {
> +        case XEN_DOMCTL_PFINFO_L4TAB:
> +            ERROR("??? Found L4 table for 32bit guest");
> +            errno = EINVAL;
> +            return -1;
> +
> +        case XEN_DOMCTL_PFINFO_L3TAB:
> +            /* 32bit guests can only use the first 4 entries of their L3 
> tables.
> +             * All other are potentially used by Xen. */
> +            xen_first = 4;
> +            xen_last = 512;
> +            break;
> +
> +        case XEN_DOMCTL_PFINFO_L2TAB:
> +            /* It is hard to spot Xen mappings in a 32bit guest's L2.  Most
> +             * are normal but only a few will have Xen mappings.

Internally Xen has PGT_pae_xen_l2, I wonder if it could be coaxed into
exposing that here too?

> +             * 428 = (HYPERVISOR_VIRT_START_PAE >> L2_PAGETABLE_SHIFT_PAE) & 
> 0x1ff
> +             *
> +             * ...which is conveniently unavailable to us in a 64bit build.

Can we export it somehow? (Perhaps with COMPAT in the name)

> ... normalize_page
> +    local_page = malloc(PAGE_SIZE);

How bad is the overhead of (what I pressume are) all these allocs and
frees? (I've not come across the caller in this patch, maybe it will
become clear later).

Would it be worth keeping an array around shadowing the "live" batch of
pages?

> +static int x86_pv_localise_page(struct context *ctx, uint32_t type, void 
> *page)

"localise" means the inverse of "normalise"?


> +void pseudophysmap_walk(struct context *ctx, xen_pfn_t mfn)
> +{
> +    xc_interface *xch = ctx->xch;
> +    xen_pfn_t pfn = ~0UL;
> +
> +    ERROR("mfn %#lx, max %#lx", mfn, ctx->x86_pv.max_mfn);

It took me a while to figure out why these were all errors. Can we call
the function dump_bad_pseudophysmap_walk or some such?

> +xen_pfn_t cr3_to_mfn(struct context *ctx, uint64_t cr3)
> +{
> +    if ( ctx->x86_pv.width == 8 )
> +        return cr3 >> 12;
> +    else
> +        return (((uint32_t)cr3 >> 12) | ((uint32_t)cr3 << 20));

A comment pointing people to the phrase "extended-cr3" as grep fodder
would be appreciated.

> +}
> +
> +uint64_t mfn_to_cr3(struct context *ctx, xen_pfn_t mfn)
> +{
> +    if ( ctx->x86_pv.width == 8 )
> +        return ((uint64_t)mfn) << 12;
> +    else
> +        return (((uint32_t)mfn << 12) | ((uint32_t)mfn >> 20));
> +}
> +
[...]
> +    else if ( guest_width == 4 )

Would prefer to drop the else and insert a blank line to separate the
error handling of the xc_domain_.. from the interpretation of its
result.

> +        guest_levels = 3;
> +    else if ( guest_width == 8 )
> +        guest_levels = 4;
> +    else
> +    {
> +        ERROR("Invalid guest width %d.  Expected 32 or 64", guest_width);

Actually you expected 4 or 8. I'm not sure if this could be strengthened
into an assert.

> +        return -1;
> +    }
> +    ctx->x86_pv.width = guest_width;
> +    ctx->x86_pv.levels = guest_levels;
> +    ctx->x86_pv.fpp = fpp = PAGE_SIZE / ctx->x86_pv.width;
> +
> +    DPRINTF("%d bits, %d levels", guest_width * 8, guest_levels);
> +
> +    /* Get the domains maximum pfn */

Apostrophes are out of fashion at the moment I guess ;-).

> +    max_pfn = xc_domain_maximum_gpfn(xch, ctx->domid);
> +    if ( max_pfn < 0 )
> +    {
> +        PERROR("Unable to obtain guests max pfn");
> +        return -1;
> +    }
> +    else if ( max_pfn >= ~XEN_DOMCTL_PFINFO_LTAB_MASK )
> +    {
> +        errno = E2BIG;
> +        PERROR("Cannot save a guest this large %#x");

Missing the parameter for %#x.

> [...]

> +/*
> + * Extract an MFN from a Pagetable Entry.
> + */
> +static inline xen_pfn_t pte_to_frame(struct context *ctx, uint64_t pte)
> +{
> +    if ( ctx->x86_pv.width == 8 )
> +        return (pte >> PAGE_SHIFT) & ((1ULL << (52 - PAGE_SHIFT)) - 1);
> +    else
> +        return (pte >> PAGE_SHIFT) & ((1ULL << (44 - PAGE_SHIFT)) - 1);

Are bits 45..52 non zero in this case?

Can we #define a couple of MASK's please.

> +static int pfn_set_populated(struct context *ctx, xen_pfn_t pfn)
> +{
> +    xc_interface *xch = ctx->xch;
> +
> +    if ( !ctx->restore.populated_pfns || pfn > 
> ctx->restore.max_populated_pfn )
> +    {
> +        unsigned long new_max_pfn = ((pfn + 1024) & ~1023) - 1;
> +        size_t old_sz, new_sz;
> +        unsigned long *p;
> +
> +        old_sz = bitmap_size(ctx->restore.max_populated_pfn + 1);
> +        new_sz = bitmap_size(new_max_pfn + 1);
> +
> +        p  = realloc(ctx->restore.populated_pfns, new_sz);

In practice do you not see an initial run of increasing pfns and end up
constantly adding one more? Consider doubling or adding e.g. 128M worth
of space when growing?
> +
> +    if ( nr_pages > 0 )
> +    {
> +        mapping = guest_page = xc_map_foreign_bulk(
> +            xch, ctx->domid, PROT_READ | PROT_WRITE,
> +            mfns, map_errs, nr_pages);
> +        if ( !mapping )
> +        {
> +            PERROR("Unable to map %u mfns for %u pages of data",
> +                   nr_pages, count);
> +            goto err;
> +        }
> +    }
> +
> +    for ( i = 0, j = 0; i < count; ++i )
> +    {
> +        switch ( types[i] )
> +        {
> +        case XEN_DOMCTL_PFINFO_XTAB:
> +        case XEN_DOMCTL_PFINFO_BROKEN:
> +            /* Nothing at all to do */

Is this a deliberate fall-thru? Please comment if so.

> +        case XEN_DOMCTL_PFINFO_XALLOC:
> +            /* Nothing futher to do */

"further"

> +int handle_page_data(struct context *ctx, struct record *rec)
> +{
> +    xc_interface *xch = ctx->xch;
> +    struct rec_page_data_header *pages = rec->data;
> +    unsigned i, pages_of_data = 0;
> +    int rc = -1;
> +
> +    xen_pfn_t *pfns = NULL, pfn;
> +    uint32_t *types = NULL, type;
> +
> +    static unsigned pg_count;
> +    pg_count++;
> +
> +    if ( rec->length < sizeof *pages )
> +    {
> +        ERROR("PAGE_DATA record trucated: length %"PRIu32", min %zu",
> +              rec->length, sizeof *pages);
> +        goto err;
> +    }
> +    else if ( pages->count < 1 )

We mostly omit the else where the previous if returns or gotos etc.

> +    for ( i = 0; i < pages->count; ++i )
> +    {
> +        pfn = pages->pfn[i] & PAGE_DATA_PFN_MASK;
> +        if ( !ctx->ops.pfn_is_valid(ctx, pfn) )
> +        {
> +            ERROR("pfn %#lx (index %u) outside domain maximum", pfn, i);
> +            goto err;
> +        }
> +
> +        type = (pages->pfn[i] & PAGE_DATA_TYPE_MASK) >> 32;
> +        if ( ((type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT) >= 5) &&
> +             ((type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT) <= 8) )

What are 5 and 8?

This might all be clearer with the use of a #define or two, and perhaps
using a switch over the expected valid types and explicitly invalid
types.


> diff --git a/tools/libxc/saverestore/save.c b/tools/libxc/saverestore/save.c
> index c013e62..e842e6c 100644
> --- a/tools/libxc/saverestore/save.c
> +++ b/tools/libxc/saverestore/save.c
> @@ -1,5 +1,47 @@
> +#include <arpa/inet.h>
> +
>  #include "common.h"
>  
> +int write_headers(struct context *ctx, uint16_t guest_type)
> +{
> +    xc_interface *xch = ctx->xch;
> +    int32_t xen_version = xc_version(xch, XENVER_version, NULL);
> +    struct ihdr ihdr =
> +        {

I think coding style puts this on the previous line and the body should
be 4 spaces further in.

> +            .marker  = IHDR_MARKER,
> +            .id      = htonl(IHDR_ID),
> +            .version = htonl(IHDR_VERSION),
> +            .options = htons(IHDR_OPT_LITTLE_ENDIAN),
> +        };
> +    struct dhdr dhdr =
> +        {
> +            .type       = guest_type,
> +            .page_shift = 12,

We have a define for this.

Ian.


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.