|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [Xen-devel] [PATCH RESEND v7 9/9] libxl: vnuma topology configuration parser and doc
On Thu, Aug 21, 2014 at 1:14 AM, Elena Ufimtseva <ufimtseva@xxxxxxxxx> wrote:
> Parses vnuma topoplogy number of nodes and memory
> ranges. If not defined, initializes vnuma with
> only one node and default topology. This one node covers
> all domain memory and all vcpus assigned to it.
>
> Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx>
> ---
> docs/man/xl.cfg.pod.5 | 77 +++++++++
> tools/libxl/xl_cmdimpl.c | 425
> ++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 502 insertions(+)
>
> diff --git a/docs/man/xl.cfg.pod.5 b/docs/man/xl.cfg.pod.5
> index 1e04eed..073664e 100644
> --- a/docs/man/xl.cfg.pod.5
> +++ b/docs/man/xl.cfg.pod.5
> @@ -264,6 +264,83 @@ if the values of B<memory=> and B<maxmem=> differ.
> A "pre-ballooned" HVM guest needs a balloon driver, without a balloon driver
> it will crash.
>
> +=item B<vnuma_nodes=N>
> +
> +Number of vNUMA nodes the guest will be initialized with on boot.
> +PV guest by default will have one vnuma node.
> +
> +=item B<vnuma_mem=[vmem1, vmem2, ...]>
> +
> +List of memory sizes for each node, defined in MBytes. Number of items
> listed must
> +match nr_vnodes. If the sum of all vnode memories does not match the domain
> memory
> +or there are missing nodes, it will fail.
> +If not specified, memory will be equally split between vnodes. Current
> minimum
> +memory size for one node is limited by 32MB.
> +
> +Example: vnuma_mem=[1024, 1024, 2048, 2048]
> +Total amount of memory in guest: 6GB
> +
> +=item B<vdistance=[d1, d2]>
> +
> +Defines the distance table for vNUMA nodes. NUMA topology distances are
> +represented by two dimensional square matrix. One element of it [i,j] is
> +a distance between nodes i and j. Trivial case is where all diagonal elements
> +are equal and matrix is symmetrical. vdistance configuration option allows
> +to define two values d1 and d2. d1 will be used for all diagonal elements of
> +distance matrix. All other values will be equal to d2 value. Usually
> distances
> +are multiple of 10 in Linux and same rule used here.
> +If not specified, the default constants values will be used for distance,
> +e.g. [10, 20]. For one node default distance is [10];
> +
> +Examples:
> +vnodes = 3
> +vdistance=[10, 20]
> +will create this distance table (this is default setting as well):
> +[10, 20, 20]
> +[20, 10, 20]
> +[20, 20, 10]
> +
> +=item B<vnuma_vcpumap=[node_nr, node_nr, ...]>
> +
> +Defines vcpu to vnode mapping as a list of integers. The position in the list
> +is a vcpu number, and the value is the vnode number to which the vcpu will be
> +assigned to.
> +Current limitations:
> +- vNUMA node must have at least one vcpu, otherwise default vcpu_to_vnode
> will be used.
> +- Total number of vnodes cannot be bigger then number of vcpus.
> +
> +Example:
> +Map of 4 vcpus to 2 vnodes:
> +0,1 vcpu -> vnode0
> +2,3 vcpu -> vnode1:
> +
> +vnuma_vcpumap = [0, 0, 1, 1]
> + 4 vcpus here - 0 1 2 3
> +
> +=item B<vnuma_vnodemap=[p1, p2, ..., pn]>
> +
> +List of physical node numbers, position in the list represents vnode number.
> +Used for manual placement of vnuma nodes to physical NUMA nodes.
> +Will not be used if automatic numa placement is active.
> +
> +Example:
> +assume NUMA machine with 4 physical nodes. Placing vnuma node 0 to pnode 2,
> +vnuma node 1 to pnode 3:
> +vnode0 -> pnode2
> +vnode1 -> pnode3
> +
> +vnuma_vnodemap=[2, 3]
> +first vnode will be placed on node 2, second on node 3.
> +
> +=item B<vnuma_autoplacement=[0|1]>
> +
> +If set to 1 and automatic NUMA placement is enabled, automatically will find
> the best
> +physical node to place vnuma nodes on. vnuma_vnodemap will be ignored.
> Automatic NUMA
> +placement is enabled if domain has no pinned cpus.
> +If vnuma_autoplacement is set to 0, then the vnodes will be placed on NUMA
> nodes set
> +in vnuma_vnodemap if there is enough memory on physical nodes. If not, then
> the allocation
> +will be made on any of the available node and be placed on multiple physical
> NUMA nodes.
> +
> =back
>
> =head3 Event Actions
> diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
> index f1c136a..4cf5302 100644
> --- a/tools/libxl/xl_cmdimpl.c
> +++ b/tools/libxl/xl_cmdimpl.c
> @@ -40,6 +40,7 @@
> #include "libxl_json.h"
> #include "libxlutil.h"
> #include "xl.h"
> +#include "libxl_vnuma.h"
>
> /* For calls which return an errno on failure */
> #define CHK_ERRNOVAL( call ) ({ \
> @@ -766,6 +767,423 @@ static void parse_vcpu_affinity(libxl_domain_build_info
> *b_info,
> }
> }
>
> +static unsigned int get_list_item_uint(XLU_ConfigList *list, unsigned int i)
> +{
> + const char *buf;
> + char *ep;
> + unsigned long ul;
> + int rc = -EINVAL;
> +
> + buf = xlu_cfg_get_listitem(list, i);
> + if (!buf)
> + return rc;
> + ul = strtoul(buf, &ep, 10);
> + if (ep == buf)
> + return rc;
> + if (ul >= UINT16_MAX)
> + return rc;
> + return (unsigned int)ul;
> +}
> +
> +static void vdistance_set(unsigned int *vdistance,
> + unsigned int nr_vnodes,
> + unsigned int samenode,
> + unsigned int othernode)
> +{
> + unsigned int idx, slot;
> + for (idx = 0; idx < nr_vnodes; idx++)
> + for (slot = 0; slot < nr_vnodes; slot++)
> + *(vdistance + slot * nr_vnodes + idx) =
> + idx == slot ? samenode : othernode;
> +}
> +
> +static void vcputovnode_default(unsigned int *cpu_to_node,
> + unsigned int nr_vnodes,
> + unsigned int max_vcpus)
> +{
> + unsigned int cpu;
> + for (cpu = 0; cpu < max_vcpus; cpu++)
> + cpu_to_node[cpu] = cpu % nr_vnodes;
> +}
> +
> +/* Split domain memory between vNUMA nodes equally. */
> +static int split_vnumamem(libxl_domain_build_info *b_info)
> +{
> + unsigned long long vnodemem = 0;
> + unsigned long n;
> + unsigned int i;
> +
> + if (b_info->vnodes == 0)
> + return -1;
> +
> + vnodemem = (b_info->max_memkb >> 10) / b_info->vnodes;
> + if (vnodemem < MIN_VNODE_SIZE)
> + return -1;
> + /* reminder in MBytes. */
> + n = (b_info->max_memkb >> 10) % b_info->vnodes;
> + /* get final sizes in MBytes. */
> + for (i = 0; i < (b_info->vnodes - 1); i++)
> + b_info->vnuma_mem[i] = vnodemem;
> + /* add the reminder to the last node. */
> + b_info->vnuma_mem[i] = vnodemem + n;
> + return 0;
> +}
> +
> +static void vnuma_vnodemap_default(unsigned int *vnuma_vnodemap,
> + unsigned int nr_vnodes)
> +{
> + unsigned int i;
> + for (i = 0; i < nr_vnodes; i++)
> + vnuma_vnodemap[i] = VNUMA_NO_NODE;
> +}
> +
> +/*
> + * init vNUMA to "zero config" with one node and all other
> + * topology parameters set to default.
> + */
> +static int vnuma_default_config(libxl_domain_build_info *b_info)
> +{
> + b_info->vnodes = 1;
> + /* all memory goes to this one vnode, as well as vcpus. */
> + if (!(b_info->vnuma_mem = (uint64_t *)calloc(b_info->vnodes,
> + sizeof(*b_info->vnuma_mem))))
> + goto bad_vnumazerocfg;
> +
> + if (!(b_info->vnuma_vcpumap = (unsigned int *)calloc(b_info->max_vcpus,
> + sizeof(*b_info->vnuma_vcpumap))))
> + goto bad_vnumazerocfg;
> +
> + if (!(b_info->vdistance = (unsigned int *)calloc(b_info->vnodes *
> + b_info->vnodes, sizeof(*b_info->vdistance))))
> + goto bad_vnumazerocfg;
> +
> + if (!(b_info->vnuma_vnodemap = (unsigned int *)calloc(b_info->vnodes,
> + sizeof(*b_info->vnuma_vnodemap))))
> + goto bad_vnumazerocfg;
> +
> + b_info->vnuma_mem[0] = b_info->max_memkb >> 10;
> +
> + /* all vcpus assigned to this vnode. */
> + vcputovnode_default(b_info->vnuma_vcpumap, b_info->vnodes,
> + b_info->max_vcpus);
> +
> + /* default vdistance is 10. */
> + vdistance_set(b_info->vdistance, b_info->vnodes, 10, 10);
> +
> + /* VNUMA_NO_NODE for vnode_to_pnode. */
> + vnuma_vnodemap_default(b_info->vnuma_vnodemap, b_info->vnodes);
> +
> + /*
> + * will be placed to some physical nodes defined by automatic
> + * numa placement or VNUMA_NO_NODE will not request exact node.
> + */
> + libxl_defbool_set(&b_info->vnuma_autoplacement, true);
> + return 0;
> +
> + bad_vnumazerocfg:
> + return -1;
> +}
> +
> +static void free_vnuma_info(libxl_domain_build_info *b_info)
> +{
> + free(b_info->vnuma_mem);
> + free(b_info->vdistance);
> + free(b_info->vnuma_vcpumap);
> + free(b_info->vnuma_vnodemap);
> + b_info->vnodes = 0;
> +}
> +
> +static int parse_vnuma_mem(XLU_Config *config,
> + libxl_domain_build_info **b_info)
> +{
> + libxl_domain_build_info *dst;
> + XLU_ConfigList *vnumamemcfg;
> + int nr_vnuma_regions, i;
> + unsigned long long vnuma_memparsed = 0;
> + unsigned long ul;
> + const char *buf;
> +
> + dst = *b_info;
> + if (!xlu_cfg_get_list(config, "vnuma_mem",
> + &vnumamemcfg, &nr_vnuma_regions, 0)) {
> +
> + if (nr_vnuma_regions != dst->vnodes) {
> + fprintf(stderr, "Number of numa regions (vnumamem = %d) is \
> + incorrect (should be %d).\n", nr_vnuma_regions,
> + dst->vnodes);
> + goto bad_vnuma_mem;
> + }
> +
> + dst->vnuma_mem = calloc(dst->vnodes,
> + sizeof(*dst->vnuma_mem));
> + if (dst->vnuma_mem == NULL) {
> + fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
> + goto bad_vnuma_mem;
> + }
> +
> + char *ep;
> + /*
> + * Will parse only nr_vnodes times, even if we have more/less
> regions.
> + * Take care of it later if less or discard if too many regions.
> + */
> + for (i = 0; i < dst->vnodes; i++) {
> + buf = xlu_cfg_get_listitem(vnumamemcfg, i);
> + if (!buf) {
> + fprintf(stderr,
> + "xl: Unable to get element %d in vnuma memory
> list.\n", i);
> + goto bad_vnuma_mem;
> + }
> +
> + ul = strtoul(buf, &ep, 10);
> + if (ep == buf) {
> + fprintf(stderr, "xl: Invalid argument parsing vnumamem:
> %s.\n", buf);
> + goto bad_vnuma_mem;
> + }
> +
> + /* 32Mb is a min size for a node, taken from Linux */
> + if (ul >= UINT32_MAX || ul < MIN_VNODE_SIZE) {
> + fprintf(stderr, "xl: vnuma memory %lu is not within %u - %u
> range.\n",
> + ul, MIN_VNODE_SIZE, UINT32_MAX);
> + goto bad_vnuma_mem;
> + }
> +
> + /* memory in MBytes */
> + dst->vnuma_mem[i] = ul;
> + }
> +
> + /* Total memory for vNUMA parsed to verify */
> + for (i = 0; i < nr_vnuma_regions; i++)
> + vnuma_memparsed = vnuma_memparsed + (dst->vnuma_mem[i]);
> +
> + /* Amount of memory for vnodes same as total? */
> + if ((vnuma_memparsed << 10) != (dst->max_memkb)) {
> + fprintf(stderr, "xl: vnuma memory is not the same as domain \
> + memory size.\n");
> + goto bad_vnuma_mem;
> + }
> + } else {
> + dst->vnuma_mem = calloc(dst->vnodes,
> + sizeof(*dst->vnuma_mem));
> + if (dst->vnuma_mem == NULL) {
> + fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
> + goto bad_vnuma_mem;
> + }
> +
> + fprintf(stderr, "WARNING: vNUMA memory ranges were not
> specified.\n");
> + fprintf(stderr, "Using default equal vnode memory size %lu Kbytes \
> + to cover %lu Kbytes.\n",
> + dst->max_memkb / dst->vnodes, dst->max_memkb);
> +
> + if (split_vnumamem(dst) < 0) {
> + fprintf(stderr, "Could not split vnuma memory into equal
> chunks.\n");
> + goto bad_vnuma_mem;
> + }
> + }
> + return 0;
> +
> + bad_vnuma_mem:
> + return -1;
> +}
> +
> +static int parse_vnuma_distance(XLU_Config *config,
> + libxl_domain_build_info **b_info)
> +{
> + libxl_domain_build_info *dst;
> + XLU_ConfigList *vdistancecfg;
> + int nr_vdist;
> +
> + dst = *b_info;
> + dst->vdistance = calloc(dst->vnodes * dst->vnodes,
> + sizeof(*dst->vdistance));
> + if (dst->vdistance == NULL)
> + goto bad_distance;
> +
> + if (!xlu_cfg_get_list(config, "vdistance", &vdistancecfg, &nr_vdist, 0))
> {
> + int d1, d2, i;
> + /*
> + * First value is the same node distance, the second as the
> + * rest of distances. The following is required right now to
> + * avoid non-symmetrical distance table as it may break latest
> kernel.
> + * TODO: Better way to analyze extended distance table, possibly
> + * OS specific.
> + */
> +
> + for (i = 0; i < nr_vdist; i++) {
> + d1 = get_list_item_uint(vdistancecfg, i);
> + }
> +
> + d1 = get_list_item_uint(vdistancecfg, 0);
> + if (dst->vnodes > 1)
> + d2 = get_list_item_uint(vdistancecfg, 1);
> + else
> + d2 = d1;
> +
> + if (d1 >= 0 && d2 >= 0) {
> + if (d1 < d2)
> + fprintf(stderr, "WARNING: vnuma distance d1 < d2, %u <
> %u\n", d1, d2);
> + vdistance_set(dst->vdistance, dst->vnodes, d1, d2);
> + } else {
> + fprintf(stderr, "WARNING: vnuma distance values are
> incorrect.\n");
> + goto bad_distance;
> + }
> + } else {
> + fprintf(stderr, "Could not parse vnuma distances.\n");
> + vdistance_set(dst->vdistance, dst->vnodes, 10, 20);
> + }
> + return 0;
> +
> + bad_distance:
> + return -1;
> +}
> +
> +static int parse_vnuma_vcpumap(XLU_Config *config,
> + libxl_domain_build_info **b_info)
> +{
> + libxl_domain_build_info *dst;
> + XLU_ConfigList *vcpumap;
> + int nr_vcpumap, i;
> +
> + dst = *b_info;
> + dst->vnuma_vcpumap = (unsigned int *)calloc(dst->max_vcpus,
> + sizeof(*dst->vnuma_vcpumap));
> + if (dst->vnuma_vcpumap == NULL)
> + goto bad_vcpumap;
> +
> + if (!xlu_cfg_get_list(config, "vnuma_vcpumap",
> + &vcpumap, &nr_vcpumap, 0)) {
> + if (nr_vcpumap == dst->max_vcpus) {
> + unsigned int vnode, vcpumask = 0, vmask;
> + vmask = ~(~0 << nr_vcpumap);
> + for (i = 0; i < nr_vcpumap; i++) {
> + vnode = get_list_item_uint(vcpumap, i);
> + if (vnode >= 0 && vnode < dst->vnodes) {
> + vcpumask |= (1 << i);
> + dst->vnuma_vcpumap[i] = vnode;
> + }
> + }
> +
> + /* Did it covered all vnodes in the vcpu mask? */
> + if ( !(((vmask & vcpumask) + 1) == (1 << nr_vcpumap)) ) {
> + fprintf(stderr, "WARNING: Not all vnodes were covered \
> + in numa_cpumask.\n");
> + goto bad_vcpumap;
> + }
> + } else {
> + fprintf(stderr, "WARNING: Bad vnuma_vcpumap.\n");
> + goto bad_vcpumap;
> + }
> + }
> + else
> + vcputovnode_default(dst->vnuma_vcpumap,
> + dst->vnodes,
> + dst->max_vcpus);
> + return 0;
> +
> + bad_vcpumap:
> + return -1;
> +}
> +
> +static int parse_vnuma_vnodemap(XLU_Config *config,
> + libxl_domain_build_info **b_info)
> +{
> + libxl_domain_build_info *dst;
> + XLU_ConfigList *vnodemap;
> + int nr_vnodemap, i;
> +
> + dst = *b_info;
> +
> + /* There is mapping to NUMA physical nodes? */
> + dst->vnuma_vnodemap = (unsigned int *)calloc(dst->vnodes,
> + sizeof(*dst->vnuma_vnodemap));
> + if (dst->vnuma_vnodemap == NULL)
> + goto bad_vnodemap;
> + if (!xlu_cfg_get_list(config, "vnuma_vnodemap",&vnodemap,
> + &nr_vnodemap, 0)) {
> + /*
> + * If not specified or incorred, will be defined
> + * later based on the machine architecture, configuration
> + * and memory availble when creating domain.
> + */
> + libxl_defbool_set(&dst->vnuma_autoplacement, false);
> + if (nr_vnodemap == dst->vnodes) {
> + unsigned int vnodemask = 0, pnode, smask;
> + smask = ~(~0 << dst->vnodes);
> + for (i = 0; i < dst->vnodes; i++) {
> + pnode = get_list_item_uint(vnodemap, i);
> + if (pnode >= 0) {
> + vnodemask |= (1 << i);
> + dst->vnuma_vnodemap[i] = pnode;
> + }
> + }
> +
> + /* Did it covered all vnodes in the mask? */
> + if ( !(((vnodemask & smask) + 1) == (1 << nr_vnodemap)) ) {
> + fprintf(stderr, "WARNING: Not all vnodes were covered \
> + vnuma_vnodemap.\n");
> + fprintf(stderr, "Automatic placement will be used for
> vnodes.\n");
> + libxl_defbool_set(&dst->vnuma_autoplacement, true);
> + vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
> + }
> + }
> + else {
> + fprintf(stderr, "WARNING: Incorrect vnuma_vnodemap.\n");
> + fprintf(stderr, "Automatic placement will be used for
> vnodes.\n");
> + libxl_defbool_set(&dst->vnuma_autoplacement, true);
> + vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
> + }
> + }
> + else {
> + fprintf(stderr, "WARNING: Missing vnuma_vnodemap.\n");
> + fprintf(stderr, "Automatic placement will be used for vnodes.\n");
> + libxl_defbool_set(&dst->vnuma_autoplacement, true);
> + vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->vnodes);
> + }
> + return 0;
> +
> + bad_vnodemap:
> + return -1;
> +
> +}
> +
> +static void parse_vnuma_config(XLU_Config *config,
> + libxl_domain_build_info *b_info)
> +{
> + long l;
> +
> + if (!xlu_cfg_get_long (config, "vnodes", &l, 0)) {
> + if (l > MAX_VNUMA_NODES) {
> + fprintf(stderr, "Too many vnuma nodes, max %d is allowed.\n",
> + MAX_VNUMA_NODES);
> + goto bad_vnuma_config;
> + }
> + b_info->vnodes = l;
> +
> + if (!xlu_cfg_get_defbool(config, "vnuma_autoplacement",
> + &b_info->vnuma_autoplacement, 0))
> + libxl_defbool_set(&b_info->vnuma_autoplacement, false);
> +
> + /* Only construct nodes with at least one vcpu. */
> + if (b_info->vnodes != 0 && b_info->max_vcpus >= b_info->vnodes) {
> + if (parse_vnuma_mem(config, &b_info) ||
> + parse_vnuma_distance(config, &b_info) ||
> + parse_vnuma_vcpumap(config, &b_info) ||
> + parse_vnuma_vnodemap(config, &b_info))
> + goto bad_vnuma_config;
> + }
> + else if (vnuma_default_config(b_info))
> + goto bad_vnuma_config;
> + }
> + /* If vnuma topology is not defined for domain, init one node */
> + else if (vnuma_default_config(b_info))
> + goto bad_vnuma_config;
> + return;
> +
> + bad_vnuma_config:
> + fprintf(stderr, "Failed to parse vnuma config or set default vnuma
> config.\n");
> + free_vnuma_info(b_info);
> + exit(1);
> +}
> +
> static void parse_config_data(const char *config_source,
> const char *config_data,
> int config_len,
> @@ -1063,6 +1481,13 @@ static void parse_config_data(const char
> *config_source,
> exit(1);
> }
>
> +
> + /*
> + * If there is no vnuma in config, "zero" vnuma config
> + * will be initialized with one node and other defaults.
> + */
> + parse_vnuma_config(config, b_info);
> +
> xlu_cfg_replace_string (config, "bootloader",
> &b_info->u.pv.bootloader, 0);
> switch (xlu_cfg_get_list_as_string_list(config, "bootloader_args",
> &b_info->u.pv.bootloader_args, 1))
> --
> 1.7.10.4
>
Apologies for multiple copies of this patch. For some reason patch v7
9/9 failed to display the patch number.
--
Elena
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |