
[Xen-devel] [PATCH v10 1/9] xen: vnuma topology and subop hypercalls



Define the interface, structures and hypercalls for the toolstack to
build the vNUMA topology and for guests that wish to retrieve it.
Two subop hypercalls are introduced by this patch:
XEN_DOMCTL_setvnumainfo, to define the vNUMA topology of a domain,
and XENMEM_get_vnumainfo, for a guest to retrieve that topology.
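
For illustration only, a toolstack could fill in the new xen_domctl_vnuma
structure roughly as sketched below.  This is not part of the patch: the
four arrays and their sizes are assumed to have been built by the caller,
do_domctl() stands for the usual libxc domctl wrapper, and the bounce
buffering that real code needs so Xen can copy_from_guest() the arrays
(as well as error handling) is elided.

    /* Hypothetical toolstack-side sketch, not part of this patch. */
    struct xen_domctl domctl;

    memset(&domctl, 0, sizeof(domctl));
    domctl.cmd = XEN_DOMCTL_setvnumainfo;
    domctl.domain = domid;
    domctl.u.vnuma.nr_vnodes = nr_vnodes;
    domctl.u.vnuma.nr_vmemranges = nr_vmemranges;
    domctl.u.vnuma.nr_vcpus = nr_vcpus;      /* must equal d->max_vcpus */
    /* pad is left zero; the hypercall rejects non-zero padding. */
    set_xen_guest_handle(domctl.u.vnuma.vdistance, vdistance);
    set_xen_guest_handle(domctl.u.vnuma.vcpu_to_vnode, vcpu_to_vnode);
    set_xen_guest_handle(domctl.u.vnuma.vnode_to_pnode, vnode_to_pnode);
    set_xen_guest_handle(domctl.u.vnuma.vmemrange, vmemrange);

    rc = do_domctl(xch, &domctl);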

Changes since v9:
    - in XENMEM_get_vnumainfo, allow the hypercall to complete successfully
    if the vNUMA topology changed while the temporary arrays were being
    allocated, provided the arrays already allocated are big enough to
    hold the new values;
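
On the guest side, retrieving the topology could look roughly like the
sketch below (again not part of the patch).  HYPERVISOR_memory_op() is
the usual guest-side hypercall wrapper and plain malloc() is used for
brevity; allocation failures and final error handling are elided.  The
loop shows the -ENOBUFS convention: if the buffers are too small, Xen
copies back the sizes it expects and the guest retries with those.

    /* Hypothetical guest-side sketch, not part of this patch. */
    struct vnuma_topology_info topo;
    unsigned int *vdistance, *vcpu_to_vnode;
    vmemrange_t *vmemrange;
    unsigned int nvnodes = 1, nranges = 1, nvcpus = 1;
    int rc;

    memset(&topo, 0, sizeof(topo));
    topo.domid = DOMID_SELF;

    for ( ; ; )
    {
        vdistance = malloc(nvnodes * nvnodes * sizeof(*vdistance));
        vcpu_to_vnode = malloc(nvcpus * sizeof(*vcpu_to_vnode));
        vmemrange = malloc(nranges * sizeof(*vmemrange));

        topo.nr_vnodes = nvnodes;
        topo.nr_vcpus = nvcpus;
        topo.nr_vmemranges = nranges;
        set_xen_guest_handle(topo.vdistance.h, vdistance);
        set_xen_guest_handle(topo.vcpu_to_vnode.h, vcpu_to_vnode);
        set_xen_guest_handle(topo.vmemrange.h, vmemrange);

        rc = HYPERVISOR_memory_op(XENMEM_get_vnumainfo, &topo);
        if ( rc != -ENOBUFS )
            break;                     /* success or a real error */

        /* Xen copied back the sizes it needs; retry with those. */
        nvnodes = topo.nr_vnodes;
        nvcpus = topo.nr_vcpus;
        nranges = topo.nr_vmemranges;
        free(vdistance);
        free(vcpu_to_vnode);
        free(vmemrange);
    }

A robust caller would also retry on -EAGAIN, which the hypervisor returns
if the topology grows between its size check and the final copy.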

Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx>
---
 xen/common/domain.c         |    3 +
 xen/common/domctl.c         |  137 +++++++++++++++++++++++++++++++++++++++++++
 xen/common/memory.c         |  123 ++++++++++++++++++++++++++++++++++++++
 xen/include/public/domctl.h |   31 ++++++++++
 xen/include/public/memory.h |   52 +++++++++++++++-
 xen/include/xen/domain.h    |   12 ++++
 xen/include/xen/sched.h     |    4 ++
 7 files changed, 361 insertions(+), 1 deletion(-)

diff --git a/xen/common/domain.c b/xen/common/domain.c
index 1952070..eaa0e93 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -280,6 +280,8 @@ struct domain *domain_create(
 
     spin_lock_init(&d->pbuf_lock);
 
+    rwlock_init(&d->vnuma_rwlock);
+
     err = -ENOMEM;
     if ( !zalloc_cpumask_var(&d->domain_dirty_cpumask) )
         goto fail;
@@ -606,6 +608,7 @@ int domain_kill(struct domain *d)
         evtchn_destroy(d);
         gnttab_release_mappings(d);
         tmem_destroy(d->tmem_client);
+        vnuma_destroy(d->vnuma);
         domain_set_outstanding_pages(d, 0);
         d->tmem_client = NULL;
         /* fallthrough */
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index c326aba..7ecf633 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -297,6 +297,120 @@ int vcpuaffinity_params_invalid(const xen_domctl_vcpuaffinity_t *vcpuaff)
             guest_handle_is_null(vcpuaff->cpumap_soft.bitmap));
 }
 
+void vnuma_destroy(struct vnuma_info *vnuma)
+{
+    if ( vnuma )
+    {
+        xfree(vnuma->vmemrange);
+        xfree(vnuma->vcpu_to_vnode);
+        xfree(vnuma->vdistance);
+        xfree(vnuma->vnode_to_pnode);
+        xfree(vnuma);
+    }
+}
+
+/*
+ * Allocates memory for the vNUMA structures; returns the allocation
+ * or an ERR_PTR() value on failure.
+ * The caller has to make sure that the domain has max_pages and the
+ * number of vcpus set.
+ * Verifies that no single allocation exceeds PAGE_SIZE.
+ */
+static struct vnuma_info *vnuma_alloc(unsigned int nr_vnodes,
+                                      unsigned int nr_regions,
+                                      unsigned int nr_vcpus)
+{
+    struct vnuma_info *vnuma;
+
+    /*
+     * Check if any of the allocations are bigger than PAGE_SIZE.
+     * See XSA-77.
+     */
+    if ( nr_vnodes * nr_vnodes > (PAGE_SIZE / sizeof(*vnuma->vdistance)) ||
+         nr_regions > (PAGE_SIZE / sizeof(*vnuma->vmemrange)) )
+        return ERR_PTR(-EINVAL);
+
+    /*
+     * If allocations become larger than PAGE_SIZE, these allocations
+     * should be split into PAGE_SIZE allocations due to XSA-77.
+     */
+    vnuma = xmalloc(struct vnuma_info);
+    if ( !vnuma )
+        return ERR_PTR(-ENOMEM);
+
+    vnuma->vdistance = xmalloc_array(unsigned int, nr_vnodes * nr_vnodes);
+    vnuma->vcpu_to_vnode = xmalloc_array(unsigned int, nr_vcpus);
+    vnuma->vnode_to_pnode = xmalloc_array(unsigned int, nr_vnodes);
+    vnuma->vmemrange = xmalloc_array(vmemrange_t, nr_regions);
+
+    if ( vnuma->vdistance == NULL || vnuma->vmemrange == NULL ||
+         vnuma->vcpu_to_vnode == NULL || vnuma->vnode_to_pnode == NULL )
+    {
+        vnuma_destroy(vnuma);
+        return ERR_PTR(-ENOMEM);
+    }
+
+    return vnuma;
+}
+
+/*
+ * Construct the vNUMA topology from uinfo.
+ */
+static struct vnuma_info *vnuma_init(const struct xen_domctl_vnuma *uinfo,
+                                     const struct domain *d)
+{
+    unsigned int i, nr_vnodes;
+    int ret = -EINVAL;
+    struct vnuma_info *info;
+
+    nr_vnodes = uinfo->nr_vnodes;
+
+    if ( nr_vnodes == 0 || nr_vnodes > uinfo->nr_vmemranges ||
+         uinfo->nr_vcpus != d->max_vcpus || uinfo->pad != 0 )
+        return ERR_PTR(ret);
+
+    info = vnuma_alloc(nr_vnodes, uinfo->nr_vmemranges, d->max_vcpus);
+    if ( IS_ERR(info) )
+        return info;
+
+    ret = -EFAULT;
+
+    if ( copy_from_guest(info->vdistance, uinfo->vdistance,
+                         nr_vnodes * nr_vnodes) )
+        goto vnuma_fail;
+
+    if ( copy_from_guest(info->vcpu_to_vnode, uinfo->vcpu_to_vnode,
+                         d->max_vcpus) )
+        goto vnuma_fail;
+
+    if ( copy_from_guest(info->vnode_to_pnode, uinfo->vnode_to_pnode,
+                         nr_vnodes) )
+        goto vnuma_fail;
+
+    if ( copy_from_guest(info->vmemrange, uinfo->vmemrange,
+                         uinfo->nr_vmemranges) )
+        goto vnuma_fail;
+
+    info->nr_vnodes = nr_vnodes;
+    info->nr_vmemranges = uinfo->nr_vmemranges;
+
+    /* Check that vmemranges flags are zero. */
+    for ( i = 0; i < info->nr_vmemranges; i++ )
+    {
+        if ( info->vmemrange[i].flags != 0 )
+        {
+            ret = -EINVAL;
+            goto vnuma_fail;
+        }
+    }
+
+    return info;
+
+ vnuma_fail:
+    vnuma_destroy(info);
+    return ERR_PTR(ret);
+}
+
 long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 {
     long ret = 0;
@@ -967,6 +1081,29 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
     }
     break;
 
+    case XEN_DOMCTL_setvnumainfo:
+    {
+        struct vnuma_info *vnuma;
+
+        vnuma = vnuma_init(&op->u.vnuma, d);
+        if ( IS_ERR(vnuma) )
+        {
+            ret = PTR_ERR(vnuma);
+            break;
+        }
+
+        ASSERT(vnuma != NULL);
+
+        /* Overwrite the vNUMA topology for the domain. */
+        write_lock(&d->vnuma_rwlock);
+        vnuma_destroy(d->vnuma);
+        d->vnuma = vnuma;
+        write_unlock(&d->vnuma_rwlock);
+
+        ret = 0;
+    }
+    break;
+
     default:
         ret = arch_do_domctl(op, d, u_domctl);
         break;
diff --git a/xen/common/memory.c b/xen/common/memory.c
index c2dd31b..fe85ffa 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -969,6 +969,129 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
 
         break;
 
+    case XENMEM_get_vnumainfo:
+    {
+        struct vnuma_topology_info topology;
+        struct domain *d;
+        unsigned int dom_vnodes, dom_vranges, dom_vcpus;
+        struct vnuma_info tmp;
+
+        /*
+         * The guest passes nr_vnodes, nr_vmemranges and nr_vcpus so
+         * we know how much memory the guest has allocated.
+         */
+        if ( copy_from_guest(&topology, arg, 1) )
+            return -EFAULT;
+
+        if ( (d = rcu_lock_domain_by_any_id(topology.domid)) == NULL )
+            return -ESRCH;
+
+        read_lock(&d->vnuma_rwlock);
+
+        if ( d->vnuma == NULL )
+        {
+            read_unlock(&d->vnuma_rwlock);
+            rcu_unlock_domain(d);
+            return -EOPNOTSUPP;
+        }
+
+        dom_vnodes = d->vnuma->nr_vnodes;
+        dom_vranges = d->vnuma->nr_vmemranges;
+        dom_vcpus = d->max_vcpus;
+
+        /*
+         * The values copied from the guest may differ from the domain's
+         * vNUMA configuration.  Check the guest parameters to make sure
+         * we do not overflow the guest buffers.  Additionally check the
+         * padding.
+         */
+        if ( topology.nr_vnodes < dom_vnodes      ||
+             topology.nr_vcpus < dom_vcpus        ||
+             topology.nr_vmemranges < dom_vranges ||
+             topology.pad != 0 )
+        {
+            read_unlock(&d->vnuma_rwlock);
+            rcu_unlock_domain(d);
+
+            topology.nr_vnodes = dom_vnodes;
+            topology.nr_vcpus = dom_vcpus;
+            topology.nr_vmemranges = dom_vranges;
+
+            /* Copy back needed values. */
+            __copy_to_guest(arg, &topology, 1);
+
+            return -ENOBUFS;
+        }
+
+        read_unlock(&d->vnuma_rwlock);
+
+        tmp.vdistance = xmalloc_array(unsigned int, dom_vnodes * dom_vnodes);
+        tmp.vmemrange = xmalloc_array(vmemrange_t, dom_vranges);
+        tmp.vcpu_to_vnode = xmalloc_array(unsigned int, dom_vcpus);
+
+        if ( tmp.vdistance == NULL || tmp.vmemrange == NULL ||
+             tmp.vcpu_to_vnode == NULL )
+        {
+            rc = -ENOMEM;
+            goto vnumainfo_out;
+        }
+
+        /*
+         * Check whether the vNUMA info has changed in the meantime and
+         * whether the allocated arrays are still big enough.
+         */
+        read_lock(&d->vnuma_rwlock);
+
+        if ( dom_vnodes < d->vnuma->nr_vnodes ||
+             dom_vranges < d->vnuma->nr_vmemranges ||
+             dom_vcpus < d->max_vcpus )
+        {
+            read_unlock(&d->vnuma_rwlock);
+            rc = -EAGAIN;
+            goto vnumainfo_out;
+        }
+
+        dom_vnodes = d->vnuma->nr_vnodes;
+        dom_vranges = d->vnuma->nr_vmemranges;
+        dom_vcpus = d->max_vcpus;
+
+        memcpy(tmp.vmemrange, d->vnuma->vmemrange,
+               sizeof(*d->vnuma->vmemrange) * dom_vranges);
+        memcpy(tmp.vdistance, d->vnuma->vdistance,
+               sizeof(*d->vnuma->vdistance) * dom_vnodes * dom_vnodes);
+        memcpy(tmp.vcpu_to_vnode, d->vnuma->vcpu_to_vnode,
+               sizeof(*d->vnuma->vcpu_to_vnode) * dom_vcpus);
+
+        read_unlock(&d->vnuma_rwlock);
+
+        rc = -EFAULT;
+
+        if ( copy_to_guest(topology.vmemrange.h, tmp.vmemrange,
+                           dom_vranges) != 0 )
+            goto vnumainfo_out;
+
+        if ( copy_to_guest(topology.vdistance.h, tmp.vdistance,
+                           dom_vnodes * dom_vnodes) != 0 )
+            goto vnumainfo_out;
+
+        if ( copy_to_guest(topology.vcpu_to_vnode.h, tmp.vcpu_to_vnode,
+                           dom_vcpus) != 0 )
+            goto vnumainfo_out;
+
+        topology.nr_vnodes = dom_vnodes;
+        topology.nr_vcpus = dom_vcpus;
+        topology.nr_vmemranges = dom_vranges;
+
+        if ( __copy_to_guest(arg, &topology, 1) != 0 )
+            rc = -EFAULT;
+        else
+            rc = 0;
+
+ vnumainfo_out:
+        rcu_unlock_domain(d);
+
+        xfree(tmp.vdistance);
+        xfree(tmp.vmemrange);
+        xfree(tmp.vcpu_to_vnode);
+        break;
+    }
+
     default:
         rc = arch_memory_op(cmd, arg);
         break;
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 5b11bbf..07c4fd6 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -35,6 +35,7 @@
 #include "xen.h"
 #include "grant_table.h"
 #include "hvm/save.h"
+#include "memory.h"
 
 #define XEN_DOMCTL_INTERFACE_VERSION 0x0000000a
 
@@ -934,6 +935,34 @@ struct xen_domctl_vcpu_msrs {
 };
 typedef struct xen_domctl_vcpu_msrs xen_domctl_vcpu_msrs_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpu_msrs_t);
+
+/*
+ * Used in XEN_DOMCTL_setvnumainfo to set the
+ * vNUMA topology of a domain.
+ */
+struct xen_domctl_vnuma {
+    uint32_t nr_vnodes;
+    uint32_t nr_vmemranges;
+    uint32_t nr_vcpus;
+    uint32_t pad;
+    XEN_GUEST_HANDLE_64(uint) vdistance;
+    XEN_GUEST_HANDLE_64(uint) vcpu_to_vnode;
+
+    /*
+     * vnode-to-pnode (physical NUMA node) mapping.
+     * This is kept on a per-domain basis for interested
+     * consumers, such as NUMA-aware ballooning.
+     */
+    XEN_GUEST_HANDLE_64(uint) vnode_to_pnode;
+
+    /*
+     * Memory ranges for each vNUMA node.
+     */
+    XEN_GUEST_HANDLE_64(vmemrange_t) vmemrange;
+};
+typedef struct xen_domctl_vnuma xen_domctl_vnuma_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_vnuma_t);
+
 #endif
 
 struct xen_domctl {
@@ -1008,6 +1037,7 @@ struct xen_domctl {
 #define XEN_DOMCTL_cacheflush                    71
 #define XEN_DOMCTL_get_vcpu_msrs                 72
 #define XEN_DOMCTL_set_vcpu_msrs                 73
+#define XEN_DOMCTL_setvnumainfo                  74
 #define XEN_DOMCTL_gdbsx_guestmemio            1000
 #define XEN_DOMCTL_gdbsx_pausevcpu             1001
 #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
@@ -1068,6 +1098,7 @@ struct xen_domctl {
         struct xen_domctl_cacheflush        cacheflush;
         struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu;
         struct xen_domctl_gdbsx_domstatus   gdbsx_domstatus;
+        struct xen_domctl_vnuma             vnuma;
         uint8_t                             pad[128];
     } u;
 };
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index 2c57aa0..db961ec 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -521,9 +521,59 @@ DEFINE_XEN_GUEST_HANDLE(xen_mem_sharing_op_t);
  * The zero value is appropiate.
  */
 
+/* vNUMA node memory ranges */
+struct vmemrange {
+    uint64_t start, end;
+    unsigned int flags;
+    unsigned int nid;
+};
+
+typedef struct vmemrange vmemrange_t;
+DEFINE_XEN_GUEST_HANDLE(vmemrange_t);
+
+/*
+ * The vNUMA topology consists of the number of vNUMA nodes, the
+ * distance table, the memory ranges and the vcpu-to-vnode mapping
+ * provided for guests.
+ * In XENMEM_get_vnumainfo the guest passes nr_vnodes, nr_vmemranges
+ * and nr_vcpus to indicate how much memory it has allocated for each
+ * array.  After the guest structures have been filled, nr_vnodes,
+ * nr_vmemranges and nr_vcpus are copied back to the guest.  If the
+ * passed values were too small, the hypercall fails and the expected
+ * values are returned to the guest instead.
+ */
+struct vnuma_topology_info {
+    /* IN */
+    domid_t domid;
+    uint16_t pad;
+    /* IN/OUT */
+    unsigned int nr_vnodes;
+    unsigned int nr_vcpus;
+    unsigned int nr_vmemranges;
+    /* OUT */
+    union {
+        XEN_GUEST_HANDLE(uint) h;
+        uint64_t pad;
+    } vdistance;
+    union {
+        XEN_GUEST_HANDLE(uint) h;
+        uint64_t pad;
+    } vcpu_to_vnode;
+    union {
+        XEN_GUEST_HANDLE(vmemrange_t) h;
+        uint64_t pad;
+    } vmemrange;
+};
+typedef struct vnuma_topology_info vnuma_topology_info_t;
+DEFINE_XEN_GUEST_HANDLE(vnuma_topology_info_t);
+
+/*
+ * XENMEM_get_vnumainfo is used by a guest to get
+ * its vNUMA topology from the hypervisor.
+ */
+#define XENMEM_get_vnumainfo               26
+
 #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
 
-/* Next available subop number is 26 */
+/* Next available subop number is 27 */
 
 #endif /* __XEN_PUBLIC_MEMORY_H__ */
 
diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
index bb1c398..c5664c2 100644
--- a/xen/include/xen/domain.h
+++ b/xen/include/xen/domain.h
@@ -89,4 +89,16 @@ extern unsigned int xen_processor_pmbits;
 
 extern bool_t opt_dom0_vcpus_pin;
 
+/* Per-domain vNUMA topology. */
+struct vnuma_info {
+    unsigned int nr_vnodes;
+    unsigned int nr_vmemranges;
+    unsigned int *vdistance;
+    unsigned int *vcpu_to_vnode;
+    unsigned int *vnode_to_pnode;
+    struct vmemrange *vmemrange;
+};
+
+void vnuma_destroy(struct vnuma_info *vnuma);
+
 #endif /* __XEN_DOMAIN_H__ */
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 4575dda..c5157e6 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -452,6 +452,10 @@ struct domain
     nodemask_t node_affinity;
     unsigned int last_alloc_node;
     spinlock_t node_affinity_lock;
+
+    /* vNUMA topology accesses are protected by rwlock. */
+    rwlock_t vnuma_rwlock;
+    struct vnuma_info *vnuma;
 };
 
 struct domain_setup_info
-- 
1.7.10.4




 

