
Re: [Xen-devel] [PATCH v10 9/9] libxl: vnuma topology configuration parser and doc



On Wed, Sep 03, 2014 at 12:24:18AM -0400, Elena Ufimtseva wrote:
> Parses the vnuma topology: number of nodes and memory
> ranges. If not defined, initializes vnuma with
> only one node and the default topology. This one node covers
> all domain memory and has all vcpus assigned to it.
> 
> Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx>


Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>

but I am not a native English speaker, so on the docs part it might
be a good thing for such a person to look over it.


> ---
>  docs/man/xl.cfg.pod.5    |   77 +++++++++
>  tools/libxl/xl_cmdimpl.c |  433 ++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 510 insertions(+)
> 
> diff --git a/docs/man/xl.cfg.pod.5 b/docs/man/xl.cfg.pod.5
> index f1fc906..2ee2cbc 100644
> --- a/docs/man/xl.cfg.pod.5
> +++ b/docs/man/xl.cfg.pod.5
> @@ -264,6 +264,83 @@ if the values of B<memory=> and B<maxmem=> differ.
>  A "pre-ballooned" HVM guest needs a balloon driver, without a balloon driver
>  it will crash.
>  
> +=item B<vnodes=N>
> +
> +Number of vNUMA nodes the guest will be initialized with on boot.
> +A PV guest will have one vNUMA node by default.
> +
> +=item B<vnuma_mem=[vmem1, vmem2, ...]>
> +
> +List of memory sizes for each node, in MBytes. The number of items listed
> +must match the number of vnodes. If the sum of all vnode memory sizes does
> +not match the domain memory, or nodes are missing, domain creation fails.
> +If not specified, memory will be split equally between vnodes. The current
> +minimum memory size for one node is 32MB.
> +
> +Example: vnuma_mem=[1024, 1024, 2048, 2048]
> +Total amount of memory in guest: 6GB
> +
> +=item B<vdistance=[d1, d2]>
> +
> +Defines the distance table for vNUMA nodes. NUMA topology distances are
> +represented by a two-dimensional square matrix, where element [i,j] is the
> +distance between nodes i and j. In the trivial case all diagonal elements
> +are equal and the matrix is symmetric. The vdistance configuration option
> +allows two values, d1 and d2, to be defined. d1 will be used for all
> +diagonal elements of the distance matrix, and all other elements will be
> +set to d2. Distances are usually multiples of 10 in Linux and the same rule
> +is used here.
> +If not specified, the default distance values [10, 20] will be used. For a
> +single node the default distance is [10].
> +
> +Examples:
> +vnodes = 3
> +vdistance=[10, 20]
> +will create this distance table (this is the default setting as well):
> +[10, 20, 20]
> +[20, 10, 20]
> +[20, 20, 10]
> +
> +=item B<vnuma_vcpumap=[node_nr, node_nr, ...]>
> +
> +Defines the vcpu to vnode mapping as a list of integers. The position in the
> +list is the vcpu number, and the value is the vnode number to which that vcpu
> +will be assigned.
> +Current limitations:
> +- Each vNUMA node must have at least one vcpu, otherwise the default
> +vcpu_to_vnode mapping will be used.
> +- The total number of vnodes cannot be bigger than the number of vcpus.
> +
> +Example:
> +Map of 4 vcpus to 2 vnodes:
> +0,1 vcpu -> vnode0
> +2,3 vcpu -> vnode1
> +
> +vnuma_vcpumap = [0, 0, 1, 1]
> + 4 vcpus here -  0  1  2  3
> +
> +=item B<vnuma_vnodemap=[p1, p2, ..., pn]>
> +
> +List of physical node numbers; the position in the list represents the vnode
> +number. Used for manual placement of vNUMA nodes on physical NUMA nodes.
> +Will not be used if automatic NUMA placement is active.
> +
> +Example:
> +Assume a NUMA machine with 4 physical nodes. To place vnuma node 0 on pnode 2
> +and vnuma node 1 on pnode 3:
> +vnode0 -> pnode2
> +vnode1 -> pnode3
> +
> +vnuma_vnodemap=[2, 3]
> +The first vnode will be placed on pnode 2, the second on pnode 3.
> +
> +=item B<vnuma_autoplacement=[0|1]>
> +
> +If set to 1 and automatic NUMA placement is enabled, the best physical nodes
> +to place the vnuma nodes on will be chosen automatically and vnuma_vnodemap
> +will be ignored. Automatic NUMA placement is enabled if the domain has no
> +pinned cpus.
> +If vnuma_autoplacement is set to 0, the vnodes will be placed on the NUMA
> +nodes set in vnuma_vnodemap, provided there is enough memory on those
> +physical nodes. If not, the allocation will be made on any of the available
> +nodes and may end up spread across multiple physical NUMA nodes.
> +
>  =back
>  
>  =head3 Event Actions
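
FWIW, pulling the examples above together, a guest config for a 4-vcpu,
2048MB guest that uses all of these options (assuming a host with at least
4 physical NUMA nodes, so that pnodes 2 and 3 exist) might look like:

    memory = 2048
    vcpus = 4
    vnodes = 2
    vnuma_mem = [1024, 1024]
    vdistance = [10, 20]
    vnuma_vcpumap = [0, 0, 1, 1]
    vnuma_vnodemap = [2, 3]
    vnuma_autoplacement = 0
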
> diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
> index 409a795..1af2250 100644
> --- a/tools/libxl/xl_cmdimpl.c
> +++ b/tools/libxl/xl_cmdimpl.c
> @@ -40,6 +40,7 @@
>  #include "libxl_json.h"
>  #include "libxlutil.h"
>  #include "xl.h"
> +#include "libxl_vnuma.h"
>  
>  /* For calls which return an errno on failure */
>  #define CHK_ERRNOVAL( call ) ({                                         \
> @@ -797,6 +798,432 @@ static void parse_vcpu_affinity(libxl_domain_build_info *b_info,
>      }
>  }
>  
> +static unsigned int get_list_item_uint(XLU_ConfigList *list, unsigned int i)
> +{
> +    const char *buf;
> +    char *ep;
> +    unsigned long ul;
> +    int rc = -EINVAL;
> +
> +    buf = xlu_cfg_get_listitem(list, i);
> +    if (!buf)
> +        return rc;
> +    ul = strtoul(buf, &ep, 10);
> +    if (ep == buf)
> +        return rc;
> +    if (ul >= UINT16_MAX)
> +        return rc;
> +    return (unsigned int)ul;
> +}
> +
> +static void vdistance_set(unsigned int *vdistance,
> +                                unsigned int nr_vnodes,
> +                                unsigned int samenode,
> +                                unsigned int othernode)
> +{
> +    unsigned int idx, slot;
> +    for (idx = 0; idx < nr_vnodes; idx++)
> +        for (slot = 0; slot < nr_vnodes; slot++)
> +            *(vdistance + slot * nr_vnodes + idx) =
> +                idx == slot ? samenode : othernode;
> +}
> +
> +static void vcputovnode_default(unsigned int *cpu_to_node,
> +                                unsigned int nr_vnodes,
> +                                unsigned int max_vcpus)
> +{
> +    unsigned int cpu;
> +    for (cpu = 0; cpu < max_vcpus; cpu++)
> +        cpu_to_node[cpu] = cpu % nr_vnodes;
> +}
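
(To illustrate the default round-robin mapping above: with max_vcpus = 4
and nr_vnodes = 2 this gives cpu_to_node = {0, 1, 0, 1}, i.e. vcpus 0 and
2 on vnode 0, vcpus 1 and 3 on vnode 1.)
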
> +
> +/* Split domain memory between vNUMA nodes equally. */
> +static int split_vnumamem(libxl_domain_build_info *b_info)
> +{
> +    unsigned long long vnodemem = 0;
> +    unsigned long n;
> +    unsigned int i;
> +
> +    if (b_info->vnodes == 0)
> +        return -1;
> +
> +    vnodemem = (b_info->max_memkb >> 10) / b_info->vnodes;
> +    if (vnodemem < MIN_VNODE_SIZE)
> +        return -1;
> +    /* remainder in MBytes. */
> +    n = (b_info->max_memkb >> 10) % b_info->vnodes;
> +    /* get final sizes in MBytes. */
> +    for (i = 0; i < (b_info->vnodes - 1); i++)
> +        b_info->vnuma_mem[i] = vnodemem;
> +    /* add the remainder to the last node. */
> +    b_info->vnuma_mem[i] = vnodemem + n;
> +    return 0;
> +}
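
(Worked example for the split above: a guest with max_memkb = 6292480,
i.e. 6145 MB, and 3 vnodes gives vnodemem = 2048 MB with a 1 MB remainder,
so vnuma_mem ends up as [2048, 2048, 2049].)
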
> +
> +static void vnuma_vnodemap_default(unsigned int *vnuma_vnodemap,
> +                                   unsigned int nr_vnodes)
> +{
> +    unsigned int i;
> +    for (i = 0; i < nr_vnodes; i++)
> +        vnuma_vnodemap[i] = VNUMA_NO_NODE;
> +}
> +
> +/*
> + * init vNUMA to "zero config" with one node and all other
> + * topology parameters set to default.
> + */
> +static int vnuma_default_config(libxl_domain_build_info *b_info)
> +{
> +    b_info->vnodes = 1;
> +    /* all memory goes to this one vnode, as well as vcpus. */
> +    if (!(b_info->vnuma_mem = (uint64_t *)calloc(b_info->vnodes,
> +                                sizeof(*b_info->vnuma_mem))))
> +        goto bad_vnumazerocfg;
> +
> +    if (!(b_info->vnuma_vcpumap = (unsigned int *)calloc(b_info->max_vcpus,
> +                                sizeof(*b_info->vnuma_vcpumap))))
> +        goto bad_vnumazerocfg;
> +
> +    if (!(b_info->vdistance = (unsigned int *)calloc(b_info->vnodes *
> +                                b_info->vnodes, sizeof(*b_info->vdistance))))
> +        goto bad_vnumazerocfg;
> +
> +    if (!(b_info->vnuma_vnodemap = (unsigned int *)calloc(b_info->vnodes,
> +                                sizeof(*b_info->vnuma_vnodemap))))
> +        goto bad_vnumazerocfg;
> +
> +    b_info->vnuma_mem[0] = b_info->max_memkb >> 10;
> +
> +    /* all vcpus assigned to this vnode. */
> +    vcputovnode_default(b_info->vnuma_vcpumap, b_info->vnodes,
> +                        b_info->max_vcpus);
> +
> +    /* default vdistance is 10. */
> +    vdistance_set(b_info->vdistance, b_info->vnodes, 10, 10);
> +
> +    /* VNUMA_NO_NODE for vnode_to_pnode. */
> +    vnuma_vnodemap_default(b_info->vnuma_vnodemap, b_info->vnodes);
> +
> +    /*
> +     * The memory will be placed on physical nodes chosen by automatic
> +     * NUMA placement; with VNUMA_NO_NODE no exact node is requested.
> +     */
> +    libxl_defbool_set(&b_info->vnuma_autoplacement, true);
> +    return 0;
> +
> + bad_vnumazerocfg:
> +    return -1;
> +}
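
(So with this zero config, a 4-vcpu 2048MB guest that sets no vnuma options
ends up with vnodes = 1, vnuma_mem = [2048], all four vcpus on vnode 0,
a single-entry distance table [10], and vnuma_vnodemap = [VNUMA_NO_NODE].)
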
> +
> +static void free_vnuma_info(libxl_domain_build_info *b_info)
> +{
> +    free(b_info->vnuma_mem);
> +    free(b_info->vdistance);
> +    free(b_info->vnuma_vcpumap);
> +    free(b_info->vnuma_vnodemap);
> +
> +    b_info->vnuma_mem = NULL;
> +    b_info->vdistance = NULL;
> +    b_info->vnuma_vcpumap = NULL;
> +    b_info->vnuma_vnodemap = NULL;
> +
> +    b_info->vnodes = 0;
> +    b_info->vmemranges = 0;
> +}
> +
> +static int parse_vnuma_mem(XLU_Config *config,
> +                            libxl_domain_build_info **b_info)
> +{
> +    libxl_domain_build_info *dst;
> +    XLU_ConfigList *vnumamemcfg;
> +    int nr_vnuma_regions, i;
> +    unsigned long long vnuma_memparsed = 0;
> +    unsigned long ul;
> +    const char *buf;
> +    char *ep;
> +
> +    dst = *b_info;
> +    if (!xlu_cfg_get_list(config, "vnuma_mem",
> +                          &vnumamemcfg, &nr_vnuma_regions, 0)) {
> +
> +        if (nr_vnuma_regions != dst->vnodes) {
> +            fprintf(stderr, "Number of numa regions (vnumamem = %d) is \
> +                    incorrect (should be %d).\n", nr_vnuma_regions,
> +                    dst->vnodes);
> +            goto bad_vnuma_mem;
> +        }
> +
> +        dst->vnuma_mem = calloc(dst->vnodes,
> +                                 sizeof(*dst->vnuma_mem));
> +        if (dst->vnuma_mem == NULL) {
> +            fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
> +            goto bad_vnuma_mem;
> +        }
> +
> +        /*
> +         * Will parse only vnodes entries, even if more or fewer regions
> +         * are listed; handle missing ones later, or discard the extras
> +         * if there are too many regions.
> +         */
> +        for (i = 0; i < dst->vnodes; i++) {
> +            buf = xlu_cfg_get_listitem(vnumamemcfg, i);
> +            if (!buf) {
> +                fprintf(stderr,
> +                        "xl: Unable to get element %d in vnuma memory 
> list.\n", i);
> +                goto bad_vnuma_mem;
> +            }
> +
> +            ul = strtoul(buf, &ep, 10);
> +            if (ep == buf) {
> +                fprintf(stderr, "xl: Invalid argument parsing vnumamem: 
> %s.\n", buf);
> +                goto bad_vnuma_mem;
> +            }
> +
> +            /* 32MB is the minimum size for a node, taken from Linux. */
> +            if (ul >= UINT32_MAX || ul < MIN_VNODE_SIZE) {
> +                fprintf(stderr, "xl: vnuma memory %lu is not within %u - %u 
> range.\n",
> +                        ul, MIN_VNODE_SIZE, UINT32_MAX);
> +                goto bad_vnuma_mem;
> +            }
> +
> +            /* memory in MBytes */
> +            dst->vnuma_mem[i] = ul;
> +        }
> +
> +        /* Total memory for vNUMA parsed to verify */
> +        for (i = 0; i < nr_vnuma_regions; i++)
> +            vnuma_memparsed = vnuma_memparsed + (dst->vnuma_mem[i]);
> +
> +        /* Amount of memory for vnodes same as total? */
> +        if ((vnuma_memparsed << 10) != (dst->max_memkb)) {
> +            fprintf(stderr, "xl: vnuma memory is not the same as domain \
> +                    memory size.\n");
> +            goto bad_vnuma_mem;
> +        }
> +    } else {
> +        dst->vnuma_mem = calloc(dst->vnodes,
> +                                      sizeof(*dst->vnuma_mem));
> +        if (dst->vnuma_mem == NULL) {
> +            fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
> +            goto bad_vnuma_mem;
> +        }
> +
> +        fprintf(stderr, "WARNING: vNUMA memory ranges were not 
> specified.\n");
> +        fprintf(stderr, "Using default equal vnode memory size %lu Kbytes \
> +                to cover %lu Kbytes.\n",
> +                dst->max_memkb / dst->vnodes, dst->max_memkb);
> +
> +        if (split_vnumamem(dst) < 0) {
> +            fprintf(stderr, "Could not split vnuma memory into equal 
> chunks.\n");
> +            goto bad_vnuma_mem;
> +        }
> +    }
> +    return 0;
> +
> + bad_vnuma_mem:
> +    return -1;
> +}
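
(A concrete instance of the size check above: vnuma_mem = [1024, 1024] sums
to 2048 MB, so (2048 << 10) = 2097152 must equal max_memkb, which is what
"memory = 2048" in the guest config produces.)
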
> +
> +static int parse_vnuma_distance(XLU_Config *config,
> +                                libxl_domain_build_info **b_info)
> +{
> +    libxl_domain_build_info *dst;
> +    XLU_ConfigList *vdistancecfg;
> +    int nr_vdist;
> +
> +    dst = *b_info;
> +    dst->vdistance = calloc(dst->vnodes * dst->vnodes,
> +                               sizeof(*dst->vdistance));
> +    if (dst->vdistance == NULL)
> +        goto bad_distance;
> +
> +    if (!xlu_cfg_get_list(config, "vdistance", &vdistancecfg, &nr_vdist, 0)) 
> {
> +        int d1, d2, i;
> +        /*
> +         * The first value is the same-node distance, the second is used
> +         * for all other distances. This is required right now to avoid a
> +         * non-symmetric distance table, as that may break recent kernels.
> +         * TODO: Better way to analyze extended distance table, possibly
> +         * OS specific.
> +         */
> +
> +        for (i = 0; i < nr_vdist; i++) {
> +            d1 = get_list_item_uint(vdistancecfg, i);
> +        }
> +
> +        d1 = get_list_item_uint(vdistancecfg, 0);
> +        if (dst->vnodes > 1)
> +           d2 = get_list_item_uint(vdistancecfg, 1);
> +        else
> +           d2 = d1;
> +
> +        if (d1 >= 0 && d2 >= 0) {
> +            if (d1 < d2)
> +                fprintf(stderr, "WARNING: vnuma distance d1 < d2, %u < 
> %u\n", d1, d2);
> +            vdistance_set(dst->vdistance, dst->vnodes, d1, d2);
> +        } else {
> +            fprintf(stderr, "WARNING: vnuma distance values are 
> incorrect.\n");
> +            goto bad_distance;
> +        }
> +    } else {
> +        fprintf(stderr, "Could not parse vnuma distances.\n");
> +        vdistance_set(dst->vdistance, dst->vnodes, 10, 20);
> +    }
> +    return 0;
> +
> + bad_distance:
> +    return -1;
> +}
> +
> +static int parse_vnuma_vcpumap(XLU_Config *config,
> +                                libxl_domain_build_info **b_info)
> +{
> +    libxl_domain_build_info *dst;
> +    XLU_ConfigList *vcpumap;
> +    int nr_vcpumap, i;
> +
> +    dst = *b_info;
> +    dst->vnuma_vcpumap = (unsigned int *)calloc(dst->max_vcpus,
> +                                     sizeof(*dst->vnuma_vcpumap));
> +    if (dst->vnuma_vcpumap == NULL)
> +        goto bad_vcpumap;
> +
> +    if (!xlu_cfg_get_list(config, "vnuma_vcpumap",
> +                          &vcpumap, &nr_vcpumap, 0)) {
> +        if (nr_vcpumap == dst->max_vcpus) {
> +            unsigned int  vnode, vcpumask = 0, vmask;
> +
> +            vmask = ~(~0 << nr_vcpumap);
> +            for (i = 0; i < nr_vcpumap; i++) {
> +                vnode = get_list_item_uint(vcpumap, i);
> +                if (vnode >= 0 && vnode < dst->vnodes) {
> +                    vcpumask |= (1 << i);
> +                    dst->vnuma_vcpumap[i] = vnode;
> +                }
> +            }
> +
> +            /* Was every vcpu in the mask assigned a valid vnode? */
> +            if ( !(((vmask & vcpumask) + 1) == (1 << nr_vcpumap)) ) {
> +                fprintf(stderr, "WARNING: Not all vnodes were covered \
> +                        in numa_cpumask.\n");
> +                goto bad_vcpumap;
> +            }
> +        } else {
> +            fprintf(stderr, "WARNING:  Bad vnuma_vcpumap.\n");
> +            goto bad_vcpumap;
> +        }
> +    }
> +    else
> +        vcputovnode_default(dst->vnuma_vcpumap,
> +                            dst->vnodes,
> +                            dst->max_vcpus);
> +    return 0;
> +
> + bad_vcpumap:
> +    return -1;
> +}
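
(Worked example for the mask check above: with nr_vcpumap = 4,
vmask = ~(~0 << 4) = 0xf. If every vcpu named a valid vnode, vcpumask = 0xf
and (vmask & vcpumask) + 1 == 0x10 == 1 << 4, so the check passes; if, say,
vcpu 2 referenced a non-existent vnode, vcpumask = 0xb and the check fails.)
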
> +
> +static int parse_vnuma_vnodemap(XLU_Config *config,
> +                                libxl_domain_build_info **b_info)
> +{
> +    libxl_domain_build_info *dst;
> +    XLU_ConfigList *vnodemap;
> +    int nr_vnodemap, i;
> +
> +    dst = *b_info;
> +
> +    /* Is there a mapping to physical NUMA nodes? */
> +    dst->vnuma_vnodemap = (unsigned int *)calloc(dst->vnodes,
> +                           sizeof(*dst->vnuma_vnodemap));
> +    if (dst->vnuma_vnodemap == NULL)
> +        goto bad_vnodemap;
> +
> +    if (!xlu_cfg_get_list(config, "vnuma_vnodemap",
> +                          &vnodemap, &nr_vnodemap, 0)) {
> +        /*
> +         * If not specified or incorrect, it will be defined
> +         * later based on the machine architecture, configuration
> +         * and memory available when creating the domain.
> +         */
> +        libxl_defbool_set(&dst->vnuma_autoplacement, false);
> +        if (nr_vnodemap == dst->vnodes) {
> +            unsigned int vnodemask = 0, pnode, smask;
> +            smask = ~(~0 << dst->vnodes);
> +            for (i = 0; i < dst->vnodes; i++) {
> +                pnode = get_list_item_uint(vnodemap, i);
> +                if (pnode >= 0) {
> +                    vnodemask |= (1 << i);
> +                    dst->vnuma_vnodemap[i] = pnode;
> +                }
> +            }
> +
> +            /* Were all vnodes covered in the mask? */
> +            if ( !(((vnodemask & smask) + 1) == (1 << nr_vnodemap)) ) {
> +                fprintf(stderr, "WARNING: Not all vnodes were covered \
> +                        vnuma_vnodemap.\n");
> +                fprintf(stderr, "Automatic placement will be used for 
> vnodes.\n");
> +                libxl_defbool_set(&dst->vnuma_autoplacement, true);
> +                vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
> +            }
> +        }
> +        else {
> +            fprintf(stderr, "WARNING: Incorrect vnuma_vnodemap.\n");
> +            fprintf(stderr, "Automatic placement will be used for 
> vnodes.\n");
> +            libxl_defbool_set(&dst->vnuma_autoplacement, true);
> +            vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
> +        }
> +    }
> +    else {
> +        fprintf(stderr, "WARNING: Missing vnuma_vnodemap.\n");
> +        fprintf(stderr, "Automatic placement will be used for vnodes.\n");
> +        libxl_defbool_set(&dst->vnuma_autoplacement, true);
> +        vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
> +    }
> +    return 0;
> +
> + bad_vnodemap:
> +    return -1;
> +
> +}
> +
> +static void parse_vnuma_config(XLU_Config *config,
> +                               libxl_domain_build_info *b_info)
> +{
> +    long l;
> +
> +    if (!xlu_cfg_get_long (config, "vnodes", &l, 0)) {
> +        if (l > MAX_VNUMA_NODES) {
> +            fprintf(stderr, "Too many vnuma nodes, max %d is allowed.\n",
> +                    MAX_VNUMA_NODES);
> +            goto bad_vnuma_config;
> +        }
> +        b_info->vnodes = l;
> +
> +        if (!xlu_cfg_get_defbool(config, "vnuma_autoplacement",
> +                    &b_info->vnuma_autoplacement, 0))
> +            libxl_defbool_set(&b_info->vnuma_autoplacement, false);
> +
> +        /* Only construct nodes with at least one vcpu. */
> +        if (b_info->vnodes != 0 && b_info->max_vcpus >= b_info->vnodes) {
> +            if (parse_vnuma_mem(config, &b_info) ||
> +                parse_vnuma_distance(config, &b_info) ||
> +                parse_vnuma_vcpumap(config, &b_info) ||
> +                parse_vnuma_vnodemap(config, &b_info))
> +                goto bad_vnuma_config;
> +        }
> +        else if (vnuma_default_config(b_info))
> +            goto bad_vnuma_config;
> +    }
> +    /* If vnuma topology is not defined for domain, init one node */
> +    else if (vnuma_default_config(b_info))
> +            goto bad_vnuma_config;
> +    return;
> +
> + bad_vnuma_config:
> +    fprintf(stderr, "Failed to parse vnuma config or set default vnuma 
> config.\n");
> +    free_vnuma_info(b_info);
> +    exit(1);
> +}
> +
>  static void parse_config_data(const char *config_source,
>                                const char *config_data,
>                                int config_len,
> @@ -924,6 +1351,12 @@ static void parse_config_data(const char *config_source,
>  
>      libxl_defbool_set(&b_info->claim_mode, claim_mode);
>  
> +    /*
> +     * If there is no vnuma section in the config, a "zero" vnuma config
> +     * will be initialized with one node and other defaults.
> +     */
> +    parse_vnuma_config(config, b_info);
> +
>      if (xlu_cfg_get_string (config, "on_poweroff", &buf, 0))
>          buf = "destroy";
>      if (!parse_action_on_shutdown(buf, &d_config->on_poweroff)) {
> -- 
> 1.7.10.4
> 
