
[Xen-devel] [PATCH v5 7/8] libxl: place vnuma domain nodes on numa nodes



Automatic NUMA placement overrides the manual vnode placement
mechanism. If vnode placement is explicitly specified, libxl tries
to fit the vnodes to the physical NUMA nodes.
This can be changed if needed, but this variant seems the least
confusing.

Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx>
---
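Note for reviewers (not part of the commit message): below is a minimal
sketch of how a toolstack caller might use the new
libxl_domain_setvnuma() entry point added by this patch. It is
illustrative only; the vmemrange start/end field names are assumed from
the xen/memory.h changes earlier in this series, and the domid, sizes
and mappings are placeholders.

    #include <libxl.h>

    /* Push a symmetric two-vnode topology to the hypervisor. */
    static int set_two_vnode_topology(libxl_ctx *ctx, uint32_t domid)
    {
        vmemrange_t ranges[2];
        /* 2x2 distance matrix, row-major: local 10, remote 20 */
        unsigned int distance[4]       = { 10, 20, 20, 10 };
        unsigned int vcpu_to_vnode[2]  = { 0, 1 }; /* one vcpu per vnode */
        unsigned int vnode_to_pnode[2] = { 0, 1 }; /* 1:1 with pnodes */

        /* 1GB per vnode; field names assumed from this series */
        ranges[0].start = 0;
        ranges[0].end   = 1ULL << 30;
        ranges[1].start = 1ULL << 30;
        ranges[1].end   = 2ULL << 30;

        return libxl_domain_setvnuma(ctx, domid, 2 /* nr_vnodes */,
                                     2 /* nr_vcpus */, ranges, distance,
                                     vcpu_to_vnode, vnode_to_pnode);
    }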
 tools/libxl/libxl.c          |   22 ++++++++
 tools/libxl/libxl.h          |   12 ++++
 tools/libxl/libxl_arch.h     |    4 ++
 tools/libxl/libxl_dom.c      |  126 +++++++++++++++++++++++++++++++++++++++++-
 tools/libxl/libxl_internal.h |    3 +
 tools/libxl/libxl_numa.c     |   44 +++++++++++++++
 tools/libxl/libxl_x86.c      |    3 +-
 7 files changed, 212 insertions(+), 2 deletions(-)
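Also for reviewers: a self-contained model of the first-fit loop in
libxl__init_vnode_to_pnode() below, to make the placement policy easier
to follow. It is a sketch under simplifying assumptions, not the
patch's code: it walks all pnodes rather than the bits set in
info->nodemap, and takes the per-pnode free memory as a plain array.

    #include <stdint.h>
    #include <string.h>

    #define VNUMA_NO_NODE (~0U)

    /* Claim the first pnode with enough free memory for each vnode;
     * vnode sizes are in MB (hence the << 20), pnode_free in bytes. */
    static void fit_vnodes(unsigned nr_vnodes, const uint64_t *vnode_mb,
                           unsigned nr_pnodes, const uint64_t *pnode_free,
                           unsigned *vnode_to_pnode)
    {
        uint64_t claimed[nr_pnodes]; /* C99 VLA, zeroed below */
        unsigned n, i;

        memset(claimed, 0, sizeof(claimed));
        for (i = 0; i < nr_vnodes; i++)
            vnode_to_pnode[i] = VNUMA_NO_NODE; /* default: unplaced */

        for (n = 0; n < nr_pnodes; n++)
            for (i = 0; i < nr_vnodes; i++)
                if (vnode_to_pnode[i] == VNUMA_NO_NODE &&
                    claimed[n] + (vnode_mb[i] << 20) <= pnode_free[n]) {
                    vnode_to_pnode[i] = n;
                    claimed[n] += vnode_mb[i] << 20;
                }
    }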

diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 900b8d4..4034a63 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -4701,6 +4701,28 @@ static int libxl__set_vcpuonline_qmp(libxl__gc *gc, uint32_t domid,
     return 0;
 }
 
+int libxl_domain_setvnuma(libxl_ctx *ctx,
+                            uint32_t domid,
+                            uint16_t nr_vnodes,
+                            uint16_t nr_vcpus,
+                            vmemrange_t *vmemrange,
+                            unsigned int *vdistance,
+                            unsigned int *vcpu_to_vnode,
+                            unsigned int *vnode_to_pnode)
+{
+    int ret;
+    ret = xc_domain_setvnuma(ctx->xch, domid, nr_vnodes,
+                                nr_vcpus, vmemrange,
+                                vdistance,
+                                vcpu_to_vnode,
+                                vnode_to_pnode);
+    if (ret < 0) {
+        LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "xc_domain_setvnuma failed");
+        return ERROR_FAIL;
+    }
+    return ret;
+}
+
 int libxl_set_vcpuonline(libxl_ctx *ctx, uint32_t domid, libxl_bitmap *cpumap)
 {
     GC_INIT(ctx);
diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
index 80947c3..f7082ff 100644
--- a/tools/libxl/libxl.h
+++ b/tools/libxl/libxl.h
@@ -308,11 +308,14 @@
 #include <netinet/in.h>
 #include <sys/wait.h> /* for pid_t */
 
+#include <xen/memory.h>
 #include <xentoollog.h>
 
 #include <libxl_uuid.h>
 #include <_libxl_list.h>
 
+#include <xen/vnuma.h>
+
 /* API compatibility. */
 #ifdef LIBXL_API_VERSION
 #if LIBXL_API_VERSION != 0x040200 && LIBXL_API_VERSION != 0x040300 && \
@@ -856,6 +859,15 @@ void libxl_vcpuinfo_list_free(libxl_vcpuinfo *, int nr_vcpus);
 void libxl_device_vtpm_list_free(libxl_device_vtpm*, int nr_vtpms);
 void libxl_vtpminfo_list_free(libxl_vtpminfo *, int nr_vtpms);
 
+int libxl_domain_setvnuma(libxl_ctx *ctx,
+                           uint32_t domid,
+                           uint16_t nr_vnodes,
+                           uint16_t nr_vcpus,
+                           vmemrange_t *vmemrange,
+                           unsigned int *vdistance,
+                           unsigned int *vcpu_to_vnode,
+                           unsigned int *vnode_to_pnode);
+
 /*
  * Devices
  * =======
diff --git a/tools/libxl/libxl_arch.h b/tools/libxl/libxl_arch.h
index d3bc136..004ec18 100644
--- a/tools/libxl/libxl_arch.h
+++ b/tools/libxl/libxl_arch.h
@@ -27,4 +27,8 @@ int libxl__arch_domain_init_hw_description(libxl__gc *gc,
 int libxl__arch_domain_finalise_hw_description(libxl__gc *gc,
                                       libxl_domain_build_info *info,
                                       struct xc_dom_image *dom);
+int libxl__arch_domain_configure(libxl__gc *gc,
+                                 libxl_domain_build_info *info,
+                                 struct xc_dom_image *dom);
+
 #endif
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index 661999c..bf922ab 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -23,6 +23,7 @@
 #include <xc_dom.h>
 #include <xen/hvm/hvm_info_table.h>
 #include <xen/hvm/hvm_xs_strings.h>
+#include <libxl_vnuma.h>
 
 libxl_domain_type libxl__domain_type(libxl__gc *gc, uint32_t domid)
 {
@@ -227,6 +228,60 @@ static void hvm_set_conf_params(xc_interface *handle, uint32_t domid,
                     libxl_defbool_val(info->u.hvm.nested_hvm));
 }
 
+/* sets vnode_to_pnode map */
+static int libxl__init_vnode_to_pnode(libxl__gc *gc, uint32_t domid,
+                        libxl_domain_build_info *info)
+{
+    unsigned int i, n;
+    int nr_nodes = 0;
+    uint64_t *vnodes_mem;
+    unsigned long long *nodes_claim = NULL;
+    libxl_numainfo *ninfo = NULL;
+
+    if (info->vnode_to_pnode == NULL) {
+        info->vnode_to_pnode = libxl__calloc(gc, info->nr_nodes,
+                                      sizeof(*info->vnode_to_pnode));
+    }
+
+    /* default setting */
+    for (i = 0; i < info->nr_nodes; i++)
+        info->vnode_to_pnode[i] = VNUMA_NO_NODE;
+
+    /* Get NUMA info */
+    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+    if (ninfo == NULL)
+        return ERROR_FAIL;
+    /* Nothing to see if only one NUMA node */
+    if (nr_nodes <= 1)
+        return 0;
+
+    vnodes_mem = info->numa_memszs;
+    /*
+     * TODO: improve the algorithm. The current one simply fits vnodes
+     * in order; it would be nice to also sort them by size first.
+     * If no pnode is found, the entry stays VNUMA_NO_NODE.
+     */
+    nodes_claim = libxl__calloc(gc, info->nr_nodes, sizeof(*nodes_claim));
+    if ( !nodes_claim )
+        return ERROR_FAIL;
+
+    libxl_for_each_set_bit(n, info->nodemap)
+    {
+        for (i = 0; i < info->nr_nodes; i++)
+        {
+            if (((nodes_claim[n] + (vnodes_mem[i] << 20)) <= ninfo[n].free) &&
+                 /*vnode was not set yet */
+                 (info->vnode_to_pnode[i] == VNUMA_NO_NODE ) )
+            {
+                info->vnode_to_pnode[i] = n;
+                nodes_claim[n] += (vnodes_mem[i] << 20);
+            }
+        }
+    }
+
+    return 0;
+}
+
 int libxl__build_pre(libxl__gc *gc, uint32_t domid,
               libxl_domain_config *d_config, libxl__domain_build_state *state)
 {
@@ -240,6 +295,22 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
         return ERROR_FAIL;
     }
 
+    /* The memory ranges are built here from the per-vnode sizes */
+    struct vmemrange *memrange = libxl__calloc(gc, info->nr_nodes,
+                                               sizeof(*memrange));
+
+    if (libxl__vnuma_align_mem(gc, domid, info, memrange) < 0) {
+        LOG(ERROR, "Failed to align memory map");
+        return ERROR_FAIL;
+    }
+
+    /* numa_placement and vnuma_autoplacement handling:
+     * If numa_placement is set (the default), do not use the vnode to
+     * pnode mapping, as the automatic placement algorithm will find the
+     * best NUMA nodes. If numa_placement is not used, we can try to use
+     * the domain's vnode to pnode mask.
+     */
+    if (libxl_defbool_val(info->numa_placement)) {
     /*
      * Check if the domain has any CPU affinity. If not, try to build
      * up one. In case numa_place_domain() find at least a suitable
@@ -249,7 +320,6 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
      * libxl_domain_set_nodeaffinity() will do the actual placement,
      * whatever that turns out to be.
      */
-    if (libxl_defbool_val(info->numa_placement)) {
         if (!libxl_bitmap_is_full(&info->cpumap)) {
             LOG(ERROR, "Can run NUMA placement only if no vcpu "
                        "affinity is specified");
@@ -259,7 +329,40 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
         rc = numa_place_domain(gc, domid, info);
         if (rc)
             return rc;
+
+        /* If a vnode_to_pnode mask was defined, don't use it when the
+         * domain is placed on NUMA nodes automatically; just warn.
+         */
+        if (!libxl_defbool_val(info->vnuma_autoplacement)) {
+            LOG(INFO, "Automatic NUMA placement for the domain is turned "
+                      "on; the vnode to physical node mapping will not "
+                      "be used.");
+        }
+        if (libxl__init_vnode_to_pnode(gc, domid, info) < 0) {
+            LOG(ERROR, "Failed to build vnode to pnode map");
+            return ERROR_FAIL;
+        }
+    } else {
+        if (!libxl_defbool_val(info->vnuma_autoplacement)) {
+            if (!libxl__vnodemap_is_usable(gc, info)) {
+                LOG(ERROR, "Defined vnode to pnode domain map cannot be used");
+                return ERROR_FAIL;
+            }
+        } else {
+            if (libxl__init_vnode_to_pnode(gc, domid, info) < 0) {
+                LOG(ERROR, "Failed to build vnode to pnode map");
+                return ERROR_FAIL;
+            }
+        }
+    }
+
+    if (xc_domain_setvnuma(ctx->xch, domid, info->nr_nodes,
+                           info->max_vcpus, memrange,
+                           info->distance, info->cpu_to_node,
+                           info->vnode_to_pnode) < 0) {
+        LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR,
+                         "Failed to set vnuma topology for domain");
+        return ERROR_FAIL;
     }
+
     libxl_domain_set_nodeaffinity(ctx, domid, &info->nodemap);
     libxl_set_vcpuaffinity_all(ctx, domid, info->max_vcpus, &info->cpumap);
 
@@ -422,6 +525,26 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid,
     dom->xenstore_domid = state->store_domid;
     dom->claim_enabled = libxl_defbool_val(info->claim_mode);
 
+    dom->vnode_to_pnode = malloc(info->nr_nodes *
+                                 sizeof(*info->vnode_to_pnode));
+    dom->numa_memszs = malloc(info->nr_nodes * sizeof(*info->numa_memszs));
+
+    if (dom->numa_memszs == NULL || dom->vnode_to_pnode == NULL) {
+        info->nr_nodes = 0;
+        free(dom->vnode_to_pnode);
+        free(dom->numa_memszs);
+        LOGE(ERROR, "Failed to allocate memory for vnuma");
+        goto out;
+    }
+
+    memcpy(dom->numa_memszs, info->numa_memszs,
+            sizeof(*info->numa_memszs) * info->nr_nodes);
+    memcpy(dom->vnode_to_pnode, info->vnode_to_pnode,
+            sizeof(*info->vnode_to_pnode) * info->nr_nodes);
+
+    dom->nr_nodes = info->nr_nodes;
+
     if ( (ret = xc_dom_boot_xen_init(dom, ctx->xch, domid)) != 0 ) {
         LOGE(ERROR, "xc_dom_boot_xen_init failed");
         goto out;
@@ -432,6 +555,7 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid,
         goto out;
     }
 #endif
+
     if ( (ret = xc_dom_parse_image(dom)) != 0 ) {
         LOGE(ERROR, "xc_dom_parse_image failed");
         goto out;
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 7ae8508..9a71fd9 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -3113,6 +3113,9 @@ void libxl__numa_candidate_put_nodemap(libxl__gc *gc,
  */
 #define CTYPE(isfoo,c) (isfoo((unsigned char)(c)))
 
+unsigned int libxl__vnodemap_is_usable(libxl__gc *gc,
+                                libxl_domain_build_info *info);
+
 int e820_sanitize(libxl_ctx *ctx, struct e820entry src[],
                          uint32_t *nr_entries,
                          unsigned long map_limitkb,
diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c
index 38f1546..7814a63 100644
--- a/tools/libxl/libxl_numa.c
+++ b/tools/libxl/libxl_numa.c
@@ -510,6 +510,50 @@ int libxl__get_numa_candidate(libxl__gc *gc,
 }
 
+/*
+ * Check if we can fit vnuma nodes to numa pnodes
+ * from vnode_to_pnode mask.
+ */
+unsigned int libxl__vnodemap_is_usable(libxl__gc *gc, libxl_domain_build_info *info)
+{
+    unsigned int i;
+    libxl_numainfo *ninfo = NULL;
+    unsigned long long *claim;
+    unsigned int node;
+    uint64_t *mems;
+    int rc, nr_nodes;
+
+    rc = nr_nodes = 0;
+
+    /*
+     * Cannot use specified mapping if not NUMA machine
+     */
+    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+    if (ninfo == NULL)
+        return rc;
+
+    mems = info->numa_memszs;
+    claim = libxl__calloc(gc, info->nr_nodes, sizeof(*claim));
+    /* Get total memory required on each physical node */
+    for (i = 0; i < info->nr_nodes; i++)
+    {
+        node = info->vnode_to_pnode[i];
+        /* Correct pnode number? */
+        if (node < nr_nodes)
+            claim[node] += (mems[i] << 20);
+        else
+            goto vnodemapout;
+    }
+    for (i = 0; i < nr_nodes; i++) {
+        if (claim[i] > ninfo[i].free)
+            /* Cannot satisfy the user's request, fall back to default */
+            goto vnodemapout;
+    }
+    rc = 1;
+
+ vnodemapout:
+    return rc;
+}
+
 /*
  * Used for PV guest with e820_host enabled and thus
  * having non-contiguous e820 memory map.
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index 7589060..46e84e4 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -1,5 +1,6 @@
 #include "libxl_internal.h"
 #include "libxl_arch.h"
+#include "libxl_vnuma.h"
 
 static const char *e820_names(int type)
 {
@@ -14,7 +15,7 @@ static const char *e820_names(int type)
     return "Unknown";
 }
 
-static int e820_sanitize(libxl_ctx *ctx, struct e820entry src[],
+int e820_sanitize(libxl_ctx *ctx, struct e820entry src[],
                          uint32_t *nr_entries,
                          unsigned long map_limitkb,
                          unsigned long balloon_kb)
-- 
1.7.10.4

