
Re: [Xen-devel] [PATCH v6 01/10] xen: vnuma topology and subop hypercalls



On Fri, Jul 18, 2014 at 01:50:00AM -0400, Elena Ufimtseva wrote:
> Define interface, structures and hypercalls for toolstack to
> build vnuma topology and for guests that wish to retrieve it.
> Two subop hypercalls introduced by patch:
> XEN_DOMCTL_setvnumainfo to define vNUMA domain topology per domain
> and XENMEM_get_vnumainfo to retrieve that topology by guest.
> 
> Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx>
> ---
>  xen/common/domain.c         |   13 ++++
>  xen/common/domctl.c         |  167 +++++++++++++++++++++++++++++++++++++++++++
>  xen/common/memory.c         |   62 ++++++++++++++++
>  xen/include/public/domctl.h |   29 ++++++++
>  xen/include/public/memory.h |   47 +++++++++++-
>  xen/include/xen/domain.h    |   11 +++
>  xen/include/xen/sched.h     |    1 +
>  7 files changed, 329 insertions(+), 1 deletion(-)
> 
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index cd64aea..895584a 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -584,6 +584,18 @@ int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d)
>      return 0;
>  }
>  
> +void vnuma_destroy(struct vnuma_info *vnuma)
> +{
> +    if ( vnuma )
> +    {
> +        xfree(vnuma->vmemrange);
> +        xfree(vnuma->vcpu_to_vnode);
> +        xfree(vnuma->vdistance);
> +        xfree(vnuma->vnode_to_pnode);
> +        xfree(vnuma);
> +    }
> +}
> +
>  int domain_kill(struct domain *d)
>  {
>      int rc = 0;
> @@ -602,6 +614,7 @@ int domain_kill(struct domain *d)
>          evtchn_destroy(d);
>          gnttab_release_mappings(d);
>          tmem_destroy(d->tmem_client);
> +        vnuma_destroy(d->vnuma);
>          domain_set_outstanding_pages(d, 0);
>          d->tmem_client = NULL;
>          /* fallthrough */
> diff --git a/xen/common/domctl.c b/xen/common/domctl.c
> index c326aba..7464284 100644
> --- a/xen/common/domctl.c
> +++ b/xen/common/domctl.c
> @@ -297,6 +297,144 @@ int vcpuaffinity_params_invalid(const xen_domctl_vcpuaffinity_t *vcpuaff)
>              guest_handle_is_null(vcpuaff->cpumap_soft.bitmap));
>  }
>  
> +/*
> + * Allocates memory for vNUMA, **vnuma should be NULL.
> + * Caller has to make sure that domain has max_pages
> + * and number of vcpus set for domain.
> + * Verifies that single allocation does not exceed
> + * PAGE_SIZE.
> + */
> +static int vnuma_alloc(struct vnuma_info **vnuma,
> +                       unsigned int nr_vnodes,
> +                       unsigned int nr_vcpus,
> +                       unsigned int dist_size)
> +{
> +    struct vnuma_info *v;
> +
> +    if ( vnuma && *vnuma )
> +        return -EINVAL;
> +
> +    v = *vnuma;
> +    /*
> +     * Check if any of the xmallocs exceeds PAGE_SIZE.
> +     * If yes, consider it as an error for now.
> +     */
> +    if ( nr_vnodes > PAGE_SIZE / sizeof(nr_vnodes)       ||
> +        nr_vcpus > PAGE_SIZE / sizeof(nr_vcpus)          ||
> +        nr_vnodes > PAGE_SIZE / sizeof(struct vmemrange) ||
> +        dist_size > PAGE_SIZE / sizeof(dist_size) )
> +        return -EINVAL;
> +
> +    v = xzalloc(struct vnuma_info);
> +    if ( !v )
> +        return -ENOMEM;
> +
> +    v->vdistance = xmalloc_array(unsigned int, dist_size);
> +    v->vmemrange = xmalloc_array(vmemrange_t, nr_vnodes);
> +    v->vcpu_to_vnode = xmalloc_array(unsigned int, nr_vcpus);
> +    v->vnode_to_pnode = xmalloc_array(unsigned int, nr_vnodes);
> +
> +    if ( v->vdistance == NULL || v->vmemrange == NULL ||
> +        v->vcpu_to_vnode == NULL || v->vnode_to_pnode == NULL )
> +    {
> +        vnuma_destroy(v);
> +        return -ENOMEM;
> +    }
> +
> +    *vnuma = v;
> +
> +    return 0;
> +}
> +
> +/*
> + * Allocate memory and construct one vNUMA node,
> + * set default parameters, assign all memory and
> + * vcpus to this node, set distance to 10.
> + */
> +static long vnuma_fallback(const struct domain *d,
> +                          struct vnuma_info **vnuma)
> +{
> +    struct vnuma_info *v;
> +    long ret;
> +
> +
> +    /* Will not destroy vNUMA here, destroy before calling this. */
> +    if ( vnuma && *vnuma )
> +        return -EINVAL;
> +
> +    v = *vnuma;
> +    ret = vnuma_alloc(&v, 1, d->max_vcpus, 1);
> +    if ( ret )
> +        return ret;
> +
> +    v->vmemrange[0].start = 0;
> +    v->vmemrange[0].end = d->max_pages << PAGE_SHIFT;
> +    v->vdistance[0] = 10;
> +    v->vnode_to_pnode[0] = NUMA_NO_NODE;
> +    memset(v->vcpu_to_vnode, 0, d->max_vcpus);
> +    v->nr_vnodes = 1;
> +
> +    *vnuma = v;
> +
> +    return 0;
> +}
> +
> +/*
> + * construct vNUMA topology from u_vnuma struct and return
> + * it in dst.
> + */
> +long vnuma_init(const struct xen_domctl_vnuma *u_vnuma,
> +                const struct domain *d,
> +                struct vnuma_info **dst)
> +{
> +    unsigned int dist_size, nr_vnodes = 0;
> +    long ret;
> +    struct vnuma_info *v = NULL;
> +
> +    ret = -EINVAL;
> +
> +    /* If vNUMA topology already set, just exit. */
> +    if ( !u_vnuma || *dst )
> +        return ret;
> +
> +    nr_vnodes = u_vnuma->nr_vnodes;
> +
> +    if ( nr_vnodes == 0 )
> +        return ret;
> +
> +    if ( nr_vnodes > (UINT_MAX / nr_vnodes) )
> +        return ret;
> +
> +    dist_size = nr_vnodes * nr_vnodes;
> +
> +    ret = vnuma_alloc(&v, nr_vnodes, d->max_vcpus, dist_size);
> +    if ( ret )
> +        return ret;
> +
> +    /* On failure, set only one vNUMA node and its success. */
> +    ret = 0;
> +
> +    if ( copy_from_guest(v->vdistance, u_vnuma->vdistance, dist_size) )
> +        goto vnuma_onenode;
> +    if ( copy_from_guest(v->vmemrange, u_vnuma->vmemrange, nr_vnodes) )
> +        goto vnuma_onenode;
> +    if ( copy_from_guest(v->vcpu_to_vnode, u_vnuma->vcpu_to_vnode,
> +        d->max_vcpus) )
> +        goto vnuma_onenode;
> +    if ( copy_from_guest(v->vnode_to_pnode, u_vnuma->vnode_to_pnode,
> +        nr_vnodes) )
> +        goto vnuma_onenode;
> +
> +    v->nr_vnodes = nr_vnodes;
> +    *dst = v;
> +
> +    return ret;
> +
> +vnuma_onenode:
> +    vnuma_destroy(v);
> +    return vnuma_fallback(d, dst);
> +}
> +
>  long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>  {
>      long ret = 0;
> @@ -967,6 +1105,35 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>      }
>      break;
>  
> +    case XEN_DOMCTL_setvnumainfo:
> +    {
> +        struct vnuma_info *v = NULL;
> +
> +        ret = -EFAULT;
> +        if ( guest_handle_is_null(op->u.vnuma.vdistance)     ||
> +            guest_handle_is_null(op->u.vnuma.vmemrange)      ||
> +            guest_handle_is_null(op->u.vnuma.vcpu_to_vnode)  ||
> +            guest_handle_is_null(op->u.vnuma.vnode_to_pnode) )
> +            return ret;
> +
> +        ret = -EINVAL;
> +
> +        ret = vnuma_init(&op->u.vnuma, d, &v);
> +        if ( ret < 0 || v == NULL )
> +            break;
> +
> +        /* overwrite vnuma for domain */
> +        if ( !d->vnuma )

You want that within the domain_lock.

Otherwise a caller (on another CPU) could try to read
d->vnuma and blow up, say by using the serial console to
read the guest vNUMA topology.
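
For example (an illustrative sketch, not code from this patch), a
reader on another CPU could still be holding the old pointer:

    /* e.g. the XENMEM_get_vnumainfo handler or a debug keyhandler */
    if ( d->vnuma != NULL )                /* sees the old pointer...    */
        nr_vnodes = d->vnuma->nr_vnodes;   /* ...which you just xfree()d */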

> +            vnuma_destroy(d->vnuma);
> +
> +        domain_lock(d);

I would just do

        vnuma_destroy(d->vnuma)

here and remove the 'if' above.
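
That is, roughly (an untested sketch of what I mean):

        domain_lock(d);
        vnuma_destroy(d->vnuma);
        d->vnuma = v;
        domain_unlock(d);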
> +        d->vnuma = v;
> +        domain_unlock(d);
> +
> +        ret = 0;
> +    }
> +    break;
> +
>      default:
>          ret = arch_do_domctl(op, d, u_domctl);
>          break;
> diff --git a/xen/common/memory.c b/xen/common/memory.c
> index c2dd31b..925b9fc 100644
> --- a/xen/common/memory.c
> +++ b/xen/common/memory.c
> @@ -969,6 +969,68 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
>  
>          break;
>  
> +    case XENMEM_get_vnumainfo:
> +    {
> +        struct vnuma_topology_info topology;
> +        struct domain *d;
> +        unsigned int dom_vnodes = 0;
> +
> +        /*
> +         * guest passes nr_vnodes and nr_vcpus thus
> +         * we know how much memory guest has allocated.
> +         */
> +        if ( copy_from_guest(&topology, arg, 1) ||
> +            guest_handle_is_null(topology.vmemrange.h) ||
> +            guest_handle_is_null(topology.vdistance.h) ||
> +            guest_handle_is_null(topology.vcpu_to_vnode.h) )
> +            return -EFAULT;
> +
> +        if ( (d = rcu_lock_domain_by_any_id(topology.domid)) == NULL )
> +            return -ESRCH;
> +
> +        rc = -EOPNOTSUPP;
> +        if ( d->vnuma == NULL )
> +            goto vnumainfo_out;
> +
> +        if ( d->vnuma->nr_vnodes == 0 )
> +            goto vnumainfo_out;
> +
> +        dom_vnodes = d->vnuma->nr_vnodes;
> +
> +        /*
> +         * guest nr_cpus and nr_nodes may differ from domain vnuma config.
> +         * Check here guest nr_nodes and nr_cpus to make sure we don't overflow.
> +         */
> +        rc = -ENOBUFS;
> +        if ( topology.nr_vnodes < dom_vnodes ||
> +            topology.nr_vcpus < d->max_vcpus )
> +            goto vnumainfo_out;
> +
> +        rc = -EFAULT;
> +
> +        if ( copy_to_guest(topology.vmemrange.h, d->vnuma->vmemrange,
> +                           dom_vnodes) != 0 )
> +            goto vnumainfo_out;
> +
> +        if ( copy_to_guest(topology.vdistance.h, d->vnuma->vdistance,
> +                           dom_vnodes * dom_vnodes) != 0 )
> +            goto vnumainfo_out;
> +
> +        if ( copy_to_guest(topology.vcpu_to_vnode.h, d->vnuma->vcpu_to_vnode,
> +                           d->max_vcpus) != 0 )
> +            goto vnumainfo_out;
> +
> +        topology.nr_vnodes = dom_vnodes;
> +
> +        if ( copy_to_guest(arg, &topology, 1) != 0 )
> +            goto vnumainfo_out;
> +        rc = 0;
> +
> + vnumainfo_out:
> +        rcu_unlock_domain(d);
> +        break;
> +    }
> +
>      default:
>          rc = arch_memory_op(cmd, arg);
>          break;
> diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
> index 5b11bbf..5ee74f4 100644
> --- a/xen/include/public/domctl.h
> +++ b/xen/include/public/domctl.h
> @@ -35,6 +35,7 @@
>  #include "xen.h"
>  #include "grant_table.h"
>  #include "hvm/save.h"
> +#include "memory.h"
>  
>  #define XEN_DOMCTL_INTERFACE_VERSION 0x0000000a
>  
> @@ -934,6 +935,32 @@ struct xen_domctl_vcpu_msrs {
>  };
>  typedef struct xen_domctl_vcpu_msrs xen_domctl_vcpu_msrs_t;
>  DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpu_msrs_t);
> +
> +/*
> + * Use in XEN_DOMCTL_setvnumainfo to set
> + * vNUMA domain topology.
> + */
> +struct xen_domctl_vnuma {
> +    uint32_t nr_vnodes;
> +    uint32_t _pad;
> +    XEN_GUEST_HANDLE_64(uint) vdistance;
> +    XEN_GUEST_HANDLE_64(uint) vcpu_to_vnode;
> +
> +    /*
> +     * vnodes to physical NUMA nodes mask.
> +     * This kept on per-domain basis for
> +     * interested consumers, such as numa aware ballooning.
> +     */
> +    XEN_GUEST_HANDLE_64(uint) vnode_to_pnode;
> +
> +    /*
> +     * memory ranges for each vNUMA node
> +     */
> +    XEN_GUEST_HANDLE_64(vmemrange_t) vmemrange;
> +};
> +typedef struct xen_domctl_vnuma xen_domctl_vnuma_t;
> +DEFINE_XEN_GUEST_HANDLE(xen_domctl_vnuma_t);
> +
>  #endif
>  
>  struct xen_domctl {
> @@ -1008,6 +1035,7 @@ struct xen_domctl {
>  #define XEN_DOMCTL_cacheflush                    71
>  #define XEN_DOMCTL_get_vcpu_msrs                 72
>  #define XEN_DOMCTL_set_vcpu_msrs                 73
> +#define XEN_DOMCTL_setvnumainfo                  74
>  #define XEN_DOMCTL_gdbsx_guestmemio            1000
>  #define XEN_DOMCTL_gdbsx_pausevcpu             1001
>  #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
> @@ -1068,6 +1096,7 @@ struct xen_domctl {
>          struct xen_domctl_cacheflush        cacheflush;
>          struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu;
>          struct xen_domctl_gdbsx_domstatus   gdbsx_domstatus;
> +        struct xen_domctl_vnuma             vnuma;
>          uint8_t                             pad[128];
>      } u;
>  };
> diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
> index 2c57aa0..2c212e1 100644
> --- a/xen/include/public/memory.h
> +++ b/xen/include/public/memory.h
> @@ -521,9 +521,54 @@ DEFINE_XEN_GUEST_HANDLE(xen_mem_sharing_op_t);
>   * The zero value is appropiate.
>   */
>  
> +/* vNUMA node memory range */
> +struct vmemrange {
> +    uint64_t start, end;
> +};
> +
> +typedef struct vmemrange vmemrange_t;
> +DEFINE_XEN_GUEST_HANDLE(vmemrange_t);
> +
> +/*
> + * vNUMA topology specifies vNUMA node number, distance table,
> + * memory ranges and vcpu mapping provided for guests.
> + * XENMEM_get_vnumainfo hypercall expects to see from guest
> + * nr_vnodes and nr_vcpus to indicate available memory. After
> + * filling guests structures, nr_vnodes and nr_vcpus copied
> + * back to guest.
> + */
> +struct vnuma_topology_info {
> +    /* IN */
> +    domid_t domid;
> +    /* IN/OUT */
> +    unsigned int nr_vnodes;
> +    unsigned int nr_vcpus;
> +    /* OUT */
> +    union {
> +        XEN_GUEST_HANDLE(uint) h;
> +        uint64_t pad;
> +    } vdistance;
> +    union {
> +        XEN_GUEST_HANDLE(uint) h;
> +        uint64_t pad;
> +    } vcpu_to_vnode;
> +    union {
> +        XEN_GUEST_HANDLE(vmemrange_t) h;
> +        uint64_t pad;
> +    } vmemrange;
> +};
> +typedef struct vnuma_topology_info vnuma_topology_info_t;
> +DEFINE_XEN_GUEST_HANDLE(vnuma_topology_info_t);
> +
> +/*
> + * XENMEM_get_vnumainfo used by guest to get
> + * vNUMA topology from hypervisor.
> + */
> +#define XENMEM_get_vnumainfo               26
> +
>  #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
>  
> -/* Next available subop number is 26 */
> +/* Next available subop number is 27 */
>  
>  #endif /* __XEN_PUBLIC_MEMORY_H__ */
>  
> diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
> index bb1c398..d29a84d 100644
> --- a/xen/include/xen/domain.h
> +++ b/xen/include/xen/domain.h
> @@ -89,4 +89,15 @@ extern unsigned int xen_processor_pmbits;
>  
>  extern bool_t opt_dom0_vcpus_pin;
>  
> +/* vnuma topology per domain. */
> +struct vnuma_info {
> +    unsigned int nr_vnodes;
> +    unsigned int *vdistance;
> +    unsigned int *vcpu_to_vnode;
> +    unsigned int *vnode_to_pnode;
> +    struct vmemrange *vmemrange;
> +};
> +
> +void vnuma_destroy(struct vnuma_info *vnuma);
> +
>  #endif /* __XEN_DOMAIN_H__ */
> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
> index d5bc461..71e4218 100644
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -447,6 +447,7 @@ struct domain
>      nodemask_t node_affinity;
>      unsigned int last_alloc_node;
>      spinlock_t node_affinity_lock;
> +    struct vnuma_info *vnuma;
>  };
>  
>  struct domain_setup_info
> -- 
> 1.7.10.4
> 
