[Xen-devel] [PATCH v6 05/10] libxl: vnuma topology configuration parser and doc
Parses vnuma topology: number of nodes and memory ranges. If not
defined, initializes vnuma with only one node and a default topology.
This one node covers all domain memory and has all vcpus assigned to it.

Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx>
---
 docs/man/xl.cfg.pod.5       |   77 ++++++++
 tools/libxl/libxl_types.idl |    6 +-
 tools/libxl/libxl_vnuma.h   |    8 +
 tools/libxl/xl_cmdimpl.c    |  425 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 515 insertions(+), 1 deletion(-)
 create mode 100644 tools/libxl/libxl_vnuma.h

diff --git a/docs/man/xl.cfg.pod.5 b/docs/man/xl.cfg.pod.5
index ff9ea77..0c7fbf8 100644
--- a/docs/man/xl.cfg.pod.5
+++ b/docs/man/xl.cfg.pod.5
@@ -242,6 +242,83 @@ if the values of B<memory=> and B<maxmem=> differ.
 A "pre-ballooned" HVM guest needs a balloon driver, without a balloon
 driver it will crash.
 
+=item B<vnodes=N>
+
+Number of vNUMA nodes the guest will be initialized with on boot.
+A PV guest has one vnuma node by default.
+
+=item B<vnuma_mem=[vmem1, vmem2, ...]>
+
+List of memory sizes for each node, in MBytes. The number of items
+listed must match B<vnodes>. Domain creation will fail if the sum of
+all vnode memory sizes does not match the domain memory, or if any
+nodes are missing. If not specified, memory is split equally between
+the vnodes. The current minimum memory size for one node is 32MB.
+
+Example: vnuma_mem=[1024, 1024, 2048, 2048]
+Total amount of memory in guest: 6GB
+
+=item B<vdistance=[d1, d2]>
+
+Defines the distance table for vNUMA nodes. NUMA topology distances are
+represented by a two-dimensional square matrix whose element [i,j] is
+the distance between nodes i and j. In the trivial case all diagonal
+elements are equal and the matrix is symmetric. The vdistance option
+takes two values: d1 is used for all diagonal elements of the distance
+matrix, and d2 for all other elements. Distances are usually multiples
+of 10 in Linux, and the same convention is used here.
+If not specified, the default [10, 20] is used. For a single node the
+default distance is [10].
+
+Examples:
+vnodes = 3
+vdistance=[10, 20]
+will create this distance table (this is the default setting as well):
+[10, 20, 20]
+[20, 10, 20]
+[20, 20, 10]
+
+=item B<vnuma_vcpumap=[node_nr, node_nr, ...]>
+
+Defines the vcpu to vnode mapping as a list of integers. The position
+in the list is a vcpu number, and the value is the vnode number the
+vcpu will be assigned to.
+Current limitations:
+- Each vNUMA node must have at least one vcpu, otherwise the default
+vcpu_to_vnode mapping will be used.
+- The total number of vnodes cannot be greater than the number of vcpus.
+
+Example:
+Map of 4 vcpus to 2 vnodes:
+0,1 vcpu -> vnode0
+2,3 vcpu -> vnode1:
+
+vnuma_vcpumap = [0, 0, 1, 1]
+ 4 vcpus here - 0 1 2 3
+
+=item B<vnuma_vnodemap=[p1, p2, ..., pn]>
+
+List of physical node numbers; the position in the list represents the
+vnode number. Used for manual placement of vnuma nodes on physical NUMA
+nodes. Ignored if automatic NUMA placement is active.
+
+Example:
+Assume a NUMA machine with 4 physical nodes. Placing vnuma node 0 on
+pnode 2 and vnuma node 1 on pnode 3:
+vnode0 -> pnode2
+vnode1 -> pnode3
+
+vnuma_vnodemap=[2, 3]
+The first vnode will be placed on node 2, the second on node 3.
+
+=item B<vnuma_autoplacement=[0|1]>
+
+If set to 1 and automatic NUMA placement is enabled, the best physical
+nodes to place the vnuma nodes on are chosen automatically, and
+vnuma_vnodemap is ignored. Automatic NUMA placement is enabled if the
+domain has no pinned cpus.
+If set to 0, the vnodes will be placed on the NUMA nodes set in
+vnuma_vnodemap, provided there is enough memory on those physical
+nodes; otherwise the allocation will be made on any of the available
+nodes and may span multiple physical NUMA nodes.
+
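+A complete example configuration using the options above (illustrative
+values only; assumes a 4 vcpu, 4GB guest on a host with at least four
+physical NUMA nodes):
+
+vcpus = 4
+memory = 4096
+vnodes = 2
+vnuma_mem = [2048, 2048]
+vdistance = [10, 20]
+vnuma_vcpumap = [0, 0, 1, 1]
+vnuma_vnodemap = [2, 3]
+vnuma_autoplacement = 0
+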
 =back
 
 =head3 Event Actions
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index de25f42..5876822 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -318,7 +318,11 @@ libxl_domain_build_info = Struct("domain_build_info",[
    ("disable_migrate", libxl_defbool),
    ("cpuid",           libxl_cpuid_policy_list),
    ("blkdev_start",    string),
-
+    ("vnuma_mem",       Array(uint64, "nr_nodes")),
+    ("vnuma_vcpumap",   Array(uint32, "nr_nodemap")),
+    ("vdistance",       Array(uint32, "nr_dist")),
+    ("vnuma_vnodemap",  Array(uint32, "nr_node_to_pnode")),
+    ("vnuma_autoplacement", libxl_defbool),
    ("device_model_version", libxl_device_model_version),
    ("device_model_stubdomain", libxl_defbool),
    # if you set device_model you must set device_model_version too
diff --git a/tools/libxl/libxl_vnuma.h b/tools/libxl/libxl_vnuma.h
new file mode 100644
index 0000000..4ff4c57
--- /dev/null
+++ b/tools/libxl/libxl_vnuma.h
@@ -0,0 +1,8 @@
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#define VNUMA_NO_NODE ~((unsigned int)0)
+
+/* Minimum vNUMA node size in MB, taken from Linux. */
+#define MIN_VNODE_SIZE 32U
+
+#define MAX_VNUMA_NODES ((unsigned int)1 << 10)
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index 68df548..5d91c2c 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -40,6 +40,7 @@
 #include "libxl_json.h"
 #include "libxlutil.h"
 #include "xl.h"
+#include "libxl_vnuma.h"
 
 /* For calls which return an errno on failure */
 #define CHK_ERRNOVAL( call ) ({                                         \
@@ -690,6 +691,423 @@ static void parse_top_level_sdl_options(XLU_Config *config,
     xlu_cfg_replace_string (config, "xauthority", &sdl->xauthority, 0);
 }
 
+
+/* Returns the parsed value on success, -EINVAL on failure. */
+static int get_list_item_uint(XLU_ConfigList *list, unsigned int i)
+{
+    const char *buf;
+    char *ep;
+    unsigned long ul;
+    int rc = -EINVAL;
+
+    buf = xlu_cfg_get_listitem(list, i);
+    if (!buf)
+        return rc;
+    ul = strtoul(buf, &ep, 10);
+    if (ep == buf)
+        return rc;
+    if (ul >= UINT16_MAX)
+        return rc;
+    return (int)ul;
+}
+
+static void vdistance_set(unsigned int *vdistance,
+                          unsigned int nr_vnodes,
+                          unsigned int samenode,
+                          unsigned int othernode)
+{
+    unsigned int idx, slot;
+    for (idx = 0; idx < nr_vnodes; idx++)
+        for (slot = 0; slot < nr_vnodes; slot++)
+            *(vdistance + slot * nr_vnodes + idx) =
+                idx == slot ? samenode : othernode;
+}
+
+/* Assign vcpus to vnodes round-robin: vcpu i goes to vnode i % nr_vnodes. */
+static void vcputovnode_default(unsigned int *cpu_to_node,
+                                unsigned int nr_vnodes,
+                                unsigned int max_vcpus)
+{
+    unsigned int cpu;
+    for (cpu = 0; cpu < max_vcpus; cpu++)
+        cpu_to_node[cpu] = cpu % nr_vnodes;
+}
+
+/* Split domain memory between vNUMA nodes equally. */
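+/*
+ * Example (illustrative): 5000MB split across 3 vnodes yields
+ * 1666MB + 1666MB + 1668MB; the remainder goes to the last vnode.
+ */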
+static int split_vnumamem(libxl_domain_build_info *b_info)
+{
+    unsigned long long vnodemem = 0;
+    unsigned long n;
+    unsigned int i;
+
+    if (b_info->nr_nodes == 0)
+        return -1;
+
+    vnodemem = (b_info->max_memkb >> 10) / b_info->nr_nodes;
+    if (vnodemem < MIN_VNODE_SIZE)
+        return -1;
+    /* remainder in MBytes. */
+    n = (b_info->max_memkb >> 10) % b_info->nr_nodes;
+    /* get final sizes in MBytes. */
+    for (i = 0; i < (b_info->nr_nodes - 1); i++)
+        b_info->vnuma_mem[i] = vnodemem;
+    /* add the remainder to the last node. */
+    b_info->vnuma_mem[i] = vnodemem + n;
+    return 0;
+}
+
+static void vnuma_vnodemap_default(unsigned int *vnuma_vnodemap,
+                                   unsigned int nr_vnodes)
+{
+    unsigned int i;
+    for (i = 0; i < nr_vnodes; i++)
+        vnuma_vnodemap[i] = VNUMA_NO_NODE;
+}
+
+/*
+ * Init vNUMA to a "zero config" with one node and all other
+ * topology parameters set to their defaults.
+ */
+static int vnuma_zero_config(libxl_domain_build_info *b_info)
+{
+    b_info->nr_nodes = 1;
+    /* All memory and all vcpus go to this one vnode. */
+    if (!(b_info->vnuma_mem = calloc(b_info->nr_nodes,
+                                     sizeof(*b_info->vnuma_mem))))
+        goto bad_vnumazerocfg;
+
+    if (!(b_info->vnuma_vcpumap = calloc(b_info->max_vcpus,
+                                         sizeof(*b_info->vnuma_vcpumap))))
+        goto bad_vnumazerocfg;
+
+    if (!(b_info->vdistance = calloc(b_info->nr_nodes * b_info->nr_nodes,
+                                     sizeof(*b_info->vdistance))))
+        goto bad_vnumazerocfg;
+
+    if (!(b_info->vnuma_vnodemap = calloc(b_info->nr_nodes,
+                                          sizeof(*b_info->vnuma_vnodemap))))
+        goto bad_vnumazerocfg;
+
+    b_info->vnuma_mem[0] = b_info->max_memkb >> 10;
+
+    /* All vcpus are assigned to this vnode. */
+    vcputovnode_default(b_info->vnuma_vcpumap, b_info->nr_nodes,
+                        b_info->max_vcpus);
+
+    /* The default vdistance is 10. */
+    vdistance_set(b_info->vdistance, b_info->nr_nodes, 10, 10);
+
+    /* VNUMA_NO_NODE for vnode_to_pnode. */
+    vnuma_vnodemap_default(b_info->vnuma_vnodemap, b_info->nr_nodes);
+
+    /*
+     * The node will be placed on a physical node chosen by automatic
+     * NUMA placement; VNUMA_NO_NODE means no exact node is requested.
+     */
+    libxl_defbool_set(&b_info->vnuma_autoplacement, true);
+    return 0;
+
+ bad_vnumazerocfg:
+    return -1;
+}
+
+static void free_vnuma_info(libxl_domain_build_info *b_info)
+{
+    free(b_info->vnuma_mem);
+    free(b_info->vdistance);
+    free(b_info->vnuma_vcpumap);
+    free(b_info->vnuma_vnodemap);
+    b_info->nr_nodes = 0;
+}
+
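+/*
+ * Parse the vnuma_mem list from the config, e.g. (illustrative)
+ * vnuma_mem = [1024, 1024, 2048]. Sizes are in MBytes and must sum to
+ * the domain memory; on a parse error the one-node "zero" config is used.
+ */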
+static int parse_vnuma_mem(XLU_Config *config,
+                           libxl_domain_build_info **b_info)
+{
+    libxl_domain_build_info *dst;
+    XLU_ConfigList *vnumamemcfg;
+    int nr_vnuma_regions, i;
+    unsigned long long vnuma_memparsed = 0;
+    unsigned long ul;
+    const char *buf;
+
+    dst = *b_info;
+    if (!xlu_cfg_get_list(config, "vnuma_mem",
+                          &vnumamemcfg, &nr_vnuma_regions, 0)) {
+
+        if (nr_vnuma_regions != dst->nr_nodes) {
+            fprintf(stderr, "Number of numa regions (vnuma_mem = %d) is "
+                    "incorrect (should be %d).\n", nr_vnuma_regions,
+                    dst->nr_nodes);
+            goto bad_vnuma_mem;
+        }
+
+        dst->vnuma_mem = calloc(dst->nr_nodes,
+                                sizeof(*dst->vnuma_mem));
+        if (dst->vnuma_mem == NULL) {
+            fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
+            goto bad_vnuma_mem;
+        }
+
+        char *ep;
+        /*
+         * Parse only nr_nodes items; on any parse error fall back to
+         * the one-node "zero" configuration.
+         */
+        for (i = 0; i < dst->nr_nodes; i++) {
+            buf = xlu_cfg_get_listitem(vnumamemcfg, i);
+            if (!buf) {
+                fprintf(stderr,
+                        "xl: Unable to get element %d in vnuma memory list.\n", i);
+                goto use_zero_config;
+            }
+            ul = strtoul(buf, &ep, 10);
+            if (ep == buf) {
+                fprintf(stderr, "xl: Invalid argument parsing vnuma_mem: %s.\n", buf);
+                goto use_zero_config;
+            }
+
+            /* 32MB is the minimum size for a node, taken from Linux. */
+            if (ul >= UINT32_MAX || ul < MIN_VNODE_SIZE) {
+                fprintf(stderr, "xl: vnuma memory %lu is not within %u - %u range.\n",
+                        ul, MIN_VNODE_SIZE, UINT32_MAX);
+                goto use_zero_config;
+            }
+
+            /* memory in MBytes */
+            dst->vnuma_mem[i] = ul;
+        }
+
+        /* Sum the parsed vNUMA memory for verification. */
+        for (i = 0; i < nr_vnuma_regions; i++)
+            vnuma_memparsed = vnuma_memparsed + (dst->vnuma_mem[i]);
+
+        /* Is the amount of memory for the vnodes the same as the total? */
+        if ((vnuma_memparsed << 10) != (dst->max_memkb)) {
+            fprintf(stderr, "xl: vnuma memory is not the same as domain "
+                    "memory size.\n");
+            goto bad_vnuma_mem;
+        }
+    } else {
+        dst->vnuma_mem = calloc(dst->nr_nodes,
+                                sizeof(*dst->vnuma_mem));
+        if (dst->vnuma_mem == NULL) {
+            fprintf(stderr, "Unable to allocate memory for vnuma ranges.\n");
+            goto bad_vnuma_mem;
+        }
+
+        fprintf(stderr, "WARNING: vNUMA memory ranges were not specified.\n");
+        fprintf(stderr, "Using default equal vnode memory size %"PRIu64" Kbytes "
+                "to cover %"PRIu64" Kbytes.\n",
+                dst->max_memkb / dst->nr_nodes, dst->max_memkb);
+
+        if (split_vnumamem(dst) < 0) {
+            fprintf(stderr, "Could not split vnuma memory into equal chunks.\n");
+            goto bad_vnuma_mem;
+        }
+    }
+    return 0;
+
+ use_zero_config:
+    /* Fall back to the one-node configuration; drop the partial parse. */
+    free(dst->vnuma_mem);
+    dst->vnuma_mem = NULL;
+    if (vnuma_zero_config(dst))
+        goto bad_vnuma_mem;
+    return 0;
+
+ bad_vnuma_mem:
+    return -1;
+}
+
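+/*
+ * Example (illustrative): vdistance = [10, 20] with vnodes = 3 produces
+ * the distance rows [10,20,20], [20,10,20], [20,20,10].
+ */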
+static int parse_vnuma_distance(XLU_Config *config,
+                                libxl_domain_build_info **b_info)
+{
+    libxl_domain_build_info *dst;
+    XLU_ConfigList *vdistancecfg;
+    int nr_vdist;
+
+    dst = *b_info;
+    dst->vdistance = calloc(dst->nr_nodes * dst->nr_nodes,
+                            sizeof(*dst->vdistance));
+    if (dst->vdistance == NULL)
+        goto bad_distance;
+
+    if (!xlu_cfg_get_list(config, "vdistance", &vdistancecfg, &nr_vdist, 0)) {
+        int d1, d2;
+        /*
+         * The first value is the same-node distance, the second is used
+         * for all other distances. This is required right now to avoid
+         * a non-symmetric distance table, as that may break recent kernels.
+         * TODO: a better way to analyze an extended distance table,
+         * possibly OS specific.
+         */
+        d1 = get_list_item_uint(vdistancecfg, 0);
+        if (dst->nr_nodes > 1)
+            d2 = get_list_item_uint(vdistancecfg, 1);
+        else
+            d2 = d1;
+
+        if (d1 >= 0 && d2 >= 0) {
+            if (d1 < d2)
+                fprintf(stderr, "WARNING: vnuma distance d1 < d2, %d < %d\n", d1, d2);
+            vdistance_set(dst->vdistance, dst->nr_nodes, d1, d2);
+        } else {
+            fprintf(stderr, "WARNING: vnuma distance values are incorrect.\n");
+            goto bad_distance;
+        }
+    } else {
+        fprintf(stderr, "vdistance not specified, using default [10, 20].\n");
+        vdistance_set(dst->vdistance, dst->nr_nodes, 10, 20);
+    }
+    return 0;
+
+ bad_distance:
+    return -1;
+}
+
+static int parse_vnuma_vcpumap(XLU_Config *config,
+                               libxl_domain_build_info **b_info)
+{
+    libxl_domain_build_info *dst;
+    XLU_ConfigList *vcpumap;
+    int nr_vcpumap, i;
+
+    dst = *b_info;
+    dst->vnuma_vcpumap = calloc(dst->max_vcpus,
+                                sizeof(*dst->vnuma_vcpumap));
+    if (dst->vnuma_vcpumap == NULL)
+        goto bad_vcpumap;
+
+    if (!xlu_cfg_get_list(config, "vnuma_vcpumap",
+                          &vcpumap, &nr_vcpumap, 0)) {
+        if (nr_vcpumap == dst->max_vcpus) {
+            unsigned int vcpumask = 0, vmask;
+            int vnode;
+            /* One bit per vcpu; this assumes nr_vcpumap < 32. */
+            vmask = ~(~0U << nr_vcpumap);
+            for (i = 0; i < nr_vcpumap; i++) {
+                vnode = get_list_item_uint(vcpumap, i);
+                if (vnode >= 0 && vnode < dst->nr_nodes) {
+                    vcpumask |= (1U << i);
+                    dst->vnuma_vcpumap[i] = vnode;
+                }
+            }
+
+            /* Was every vcpu assigned a valid vnode? */
+            if (!(((vmask & vcpumask) + 1) == (1U << nr_vcpumap))) {
+                fprintf(stderr, "WARNING: Not all vcpus were covered "
+                        "in vnuma_vcpumap.\n");
+                goto bad_vcpumap;
+            }
+        } else {
+            fprintf(stderr, "WARNING: Bad vnuma_vcpumap.\n");
+            goto bad_vcpumap;
+        }
+    }
+    else
+        vcputovnode_default(dst->vnuma_vcpumap,
+                            dst->nr_nodes,
+                            dst->max_vcpus);
+    return 0;
+
+ bad_vcpumap:
+    return -1;
+}
+
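+/*
+ * Example (illustrative): on a host with four physical nodes,
+ * vnuma_vnodemap = [2, 3] places vnode 0 on pnode 2 and vnode 1 on
+ * pnode 3.
+ */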
+static int parse_vnuma_vnodemap(XLU_Config *config,
+                                libxl_domain_build_info **b_info)
+{
+    libxl_domain_build_info *dst;
+    XLU_ConfigList *vnodemap;
+    int nr_vnodemap, i;
+
+    dst = *b_info;
+
+    /* Is there a mapping to physical NUMA nodes? */
+    dst->vnuma_vnodemap = calloc(dst->nr_nodes,
+                                 sizeof(*dst->vnuma_vnodemap));
+    if (dst->vnuma_vnodemap == NULL)
+        goto bad_vnodemap;
+    if (!xlu_cfg_get_list(config, "vnuma_vnodemap", &vnodemap,
+                          &nr_vnodemap, 0)) {
+        /*
+         * If not specified or incorrect, the mapping will be defined
+         * later based on the machine architecture, configuration
+         * and memory available when creating the domain.
+         */
+        libxl_defbool_set(&dst->vnuma_autoplacement, false);
+        if (nr_vnodemap == dst->nr_nodes) {
+            unsigned int vnodemask = 0, smask;
+            int pnode;
+            /* One bit per vnode; this assumes nr_nodes < 32. */
+            smask = ~(~0U << dst->nr_nodes);
+            for (i = 0; i < dst->nr_nodes; i++) {
+                pnode = get_list_item_uint(vnodemap, i);
+                if (pnode >= 0) {
+                    vnodemask |= (1U << i);
+                    dst->vnuma_vnodemap[i] = pnode;
+                }
+            }
+
+            /* Did it cover all vnodes in the mask? */
+            if (!(((vnodemask & smask) + 1) == (1U << nr_vnodemap))) {
+                fprintf(stderr, "WARNING: Not all vnodes were covered "
+                        "in vnuma_vnodemap.\n");
+                fprintf(stderr, "Automatic placement will be used for vnodes.\n");
+                libxl_defbool_set(&dst->vnuma_autoplacement, true);
+                vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->nr_nodes);
+            }
+        }
+        else {
+            fprintf(stderr, "WARNING: Incorrect vnuma_vnodemap.\n");
+            fprintf(stderr, "Automatic placement will be used for vnodes.\n");
+            libxl_defbool_set(&dst->vnuma_autoplacement, true);
+            vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->nr_nodes);
+        }
+    }
+    else {
+        fprintf(stderr, "WARNING: Missing vnuma_vnodemap.\n");
+        fprintf(stderr, "Automatic placement will be used for vnodes.\n");
+        libxl_defbool_set(&dst->vnuma_autoplacement, true);
+        vnuma_vnodemap_default(dst->vnuma_vnodemap, dst->nr_nodes);
+    }
+    return 0;
+
+ bad_vnodemap:
+    return -1;
+}
+
+static void parse_vnuma_config(XLU_Config *config,
+                               libxl_domain_build_info *b_info)
+{
+    long l;
+
+    if (!xlu_cfg_get_long (config, "vnodes", &l, 0)) {
+        if (l > MAX_VNUMA_NODES) {
+            fprintf(stderr, "Too many vnuma nodes, max %u is allowed.\n",
+                    MAX_VNUMA_NODES);
+            goto bad_vnuma_config;
+        }
+        b_info->nr_nodes = l;
+
+        /* If not set in the config, default autoplacement to false. */
+        if (xlu_cfg_get_defbool(config, "vnuma_autoplacement",
+                                &b_info->vnuma_autoplacement, 0))
+            libxl_defbool_set(&b_info->vnuma_autoplacement, false);
+
+        /* Only construct nodes with at least one vcpu for now. */
+        if (b_info->nr_nodes != 0 && b_info->max_vcpus >= b_info->nr_nodes) {
+            if (parse_vnuma_mem(config, &b_info) ||
+                parse_vnuma_distance(config, &b_info) ||
+                parse_vnuma_vcpumap(config, &b_info) ||
+                parse_vnuma_vnodemap(config, &b_info))
+                goto bad_vnuma_config;
+        }
+        else if (vnuma_zero_config(b_info))
+            goto bad_vnuma_config;
+    }
+    /* If no vnuma topology is defined for the domain, init one node. */
+    else if (vnuma_zero_config(b_info))
+        goto bad_vnuma_config;
+    return;
+
+ bad_vnuma_config:
+    free_vnuma_info(b_info);
+    exit(1);
+}
+
 static void parse_config_data(const char *config_source,
                               const char *config_data,
                               int config_len,
@@ -1021,6 +1439,13 @@ static void parse_config_data(const char *config_source,
             exit(1);
         }
 
+
+        /*
+         * If there is no vnuma in the config, a "zero" vnuma config
+         * will be initialized, with one node and other defaults.
+         */
+        parse_vnuma_config(config, b_info);
+
         xlu_cfg_replace_string (config, "bootloader", &b_info->u.pv.bootloader, 0);
         switch (xlu_cfg_get_list_as_string_list(config, "bootloader_args",
                                       &b_info->u.pv.bootloader_args, 1))
-- 
1.7.10.4

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel