Re: [Xen-devel] [PATCH v6 01/10] xen: vnuma topology and subop hypercalls



On Fri, Jul 18, 2014 at 9:49 AM, Konrad Rzeszutek Wilk
<konrad.wilk@xxxxxxxxxx> wrote:
> On Fri, Jul 18, 2014 at 01:50:00AM -0400, Elena Ufimtseva wrote:
>> Define the interface, structures and hypercalls for the toolstack to
>> build a vNUMA topology and for guests that wish to retrieve it.
>> Two subop hypercalls are introduced by this patch:
>> XEN_DOMCTL_setvnumainfo to set the vNUMA topology of a domain,
>> and XENMEM_get_vnumainfo for a guest to retrieve that topology.
>>
>> Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx>
>> ---
>>  xen/common/domain.c         |   13 ++++
>>  xen/common/domctl.c         |  167 +++++++++++++++++++++++++++++++++++++++++++
>>  xen/common/memory.c         |   62 ++++++++++++++++
>>  xen/include/public/domctl.h |   29 ++++++++
>>  xen/include/public/memory.h |   47 +++++++++++-
>>  xen/include/xen/domain.h    |   11 +++
>>  xen/include/xen/sched.h     |    1 +
>>  7 files changed, 329 insertions(+), 1 deletion(-)
>>
>> diff --git a/xen/common/domain.c b/xen/common/domain.c
>> index cd64aea..895584a 100644
>> --- a/xen/common/domain.c
>> +++ b/xen/common/domain.c
>> @@ -584,6 +584,18 @@ int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d)
>>      return 0;
>>  }
>>
>> +void vnuma_destroy(struct vnuma_info *vnuma)
>> +{
>> +    if ( vnuma )
>> +    {
>> +        xfree(vnuma->vmemrange);
>> +        xfree(vnuma->vcpu_to_vnode);
>> +        xfree(vnuma->vdistance);
>> +        xfree(vnuma->vnode_to_pnode);
>> +        xfree(vnuma);
>> +    }
>> +}
>> +
>>  int domain_kill(struct domain *d)
>>  {
>>      int rc = 0;
>> @@ -602,6 +614,7 @@ int domain_kill(struct domain *d)
>>          evtchn_destroy(d);
>>          gnttab_release_mappings(d);
>>          tmem_destroy(d->tmem_client);
>> +        vnuma_destroy(d->vnuma);
>>          domain_set_outstanding_pages(d, 0);
>>          d->tmem_client = NULL;
>>          /* fallthrough */
>> diff --git a/xen/common/domctl.c b/xen/common/domctl.c
>> index c326aba..7464284 100644
>> --- a/xen/common/domctl.c
>> +++ b/xen/common/domctl.c
>> @@ -297,6 +297,144 @@ int vcpuaffinity_params_invalid(const xen_domctl_vcpuaffinity_t *vcpuaff)
>>              guest_handle_is_null(vcpuaff->cpumap_soft.bitmap));
>>  }
>>
>> +/*
>> + * Allocates memory for the vNUMA structures; *vnuma must be NULL.
>> + * The caller has to make sure that the domain has max_pages and
>> + * the number of vcpus set.  Verifies that no single allocation
>> + * exceeds PAGE_SIZE.
>> + */
>> +static int vnuma_alloc(struct vnuma_info **vnuma,
>> +                       unsigned int nr_vnodes,
>> +                       unsigned int nr_vcpus,
>> +                       unsigned int dist_size)
>> +{
>> +    struct vnuma_info *v;
>> +
>> +    if ( vnuma && *vnuma )
>> +        return -EINVAL;
>> +
>> +    v = *vnuma;
>> +    /*
>> +     * Check whether any of the allocations would exceed PAGE_SIZE.
>> +     * If so, consider it an error for now.
>> +     */
>> +    if ( nr_vnodes > PAGE_SIZE / sizeof(nr_vnodes)       ||
>> +        nr_vcpus > PAGE_SIZE / sizeof(nr_vcpus)          ||
>> +        nr_vnodes > PAGE_SIZE / sizeof(struct vmemrange) ||
>> +        dist_size > PAGE_SIZE / sizeof(dist_size) )
>> +        return -EINVAL;
>> +
>> +    v = xzalloc(struct vnuma_info);
>> +    if ( !v )
>> +        return -ENOMEM;
>> +
>> +    v->vdistance = xmalloc_array(unsigned int, dist_size);
>> +    v->vmemrange = xmalloc_array(vmemrange_t, nr_vnodes);
>> +    v->vcpu_to_vnode = xmalloc_array(unsigned int, nr_vcpus);
>> +    v->vnode_to_pnode = xmalloc_array(unsigned int, nr_vnodes);
>> +
>> +    if ( v->vdistance == NULL || v->vmemrange == NULL ||
>> +        v->vcpu_to_vnode == NULL || v->vnode_to_pnode == NULL )
>> +    {
>> +        vnuma_destroy(v);
>> +        return -ENOMEM;
>> +    }
>> +
>> +    *vnuma = v;
>> +
>> +    return 0;
>> +}
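
(A side note on the size checks above: dividing PAGE_SIZE by the element
size, rather than multiplying the element count by the size, keeps the
bound test itself free of overflow. A minimal illustration of the idiom,
not part of the patch:)

/*
 * Illustration only: accept nr elements of the given size iff the
 * array fits in one page, without ever computing nr * size (which
 * could wrap for a huge nr).  With 4K pages and the 16-byte struct
 * vmemrange this caps nr_vnodes at 256, for example.
 */
static inline int fits_one_page(unsigned int nr, size_t size)
{
    return nr <= PAGE_SIZE / size;
}
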
>> +
>> +/*
>> + * Allocate memory and construct one vNUMA node,
>> + * set default parameters, assign all memory and
>> + * vcpus to this node, set distance to 10.
>> + */
>> +static long vnuma_fallback(const struct domain *d,
>> +                          struct vnuma_info **vnuma)
>> +{
>> +    struct vnuma_info *v;
>> +    long ret;
>> +
>> +    /* Does not destroy an existing vNUMA here; destroy it before calling. */
>> +    if ( vnuma && *vnuma )
>> +        return -EINVAL;
>> +
>> +    v = *vnuma;
>> +    ret = vnuma_alloc(&v, 1, d->max_vcpus, 1);
>> +    if ( ret )
>> +        return ret;
>> +
>> +    v->vmemrange[0].start = 0;
>> +    v->vmemrange[0].end = d->max_pages << PAGE_SHIFT;
>> +    v->vdistance[0] = 10;
>> +    v->vnode_to_pnode[0] = NUMA_NO_NODE;
>> +    memset(v->vcpu_to_vnode, 0, d->max_vcpus * sizeof(*v->vcpu_to_vnode));
>> +    v->nr_vnodes = 1;
>> +
>> +    *vnuma = v;
>> +
>> +    return 0;
>> +}
>> +
>> +/*
>> + * Construct the vNUMA topology from the u_vnuma struct and
>> + * return it in dst.
>> + */
>> +long vnuma_init(const struct xen_domctl_vnuma *u_vnuma,
>> +                const struct domain *d,
>> +                struct vnuma_info **dst)
>> +{
>> +    unsigned int dist_size, nr_vnodes = 0;
>> +    long ret;
>> +    struct vnuma_info *v = NULL;
>> +
>> +    ret = -EINVAL;
>> +
>> +    /* If vNUMA topology already set, just exit. */
>> +    if ( !u_vnuma || *dst )
>> +        return ret;
>> +
>> +    nr_vnodes = u_vnuma->nr_vnodes;
>> +
>> +    if ( nr_vnodes == 0 )
>> +        return ret;
>> +
>> +    if ( nr_vnodes > (UINT_MAX / nr_vnodes) )
>> +        return ret;
>> +
>> +    dist_size = nr_vnodes * nr_vnodes;
>> +
>> +    ret = vnuma_alloc(&v, nr_vnodes, d->max_vcpus, dist_size);
>> +    if ( ret )
>> +        return ret;
>> +
>> +    /* On failure, fall back to a single vNUMA node and report success. */
>> +    ret = 0;
>> +
>> +    if ( copy_from_guest(v->vdistance, u_vnuma->vdistance, dist_size) )
>> +        goto vnuma_onenode;
>> +    if ( copy_from_guest(v->vmemrange, u_vnuma->vmemrange, nr_vnodes) )
>> +        goto vnuma_onenode;
>> +    if ( copy_from_guest(v->vcpu_to_vnode, u_vnuma->vcpu_to_vnode,
>> +        d->max_vcpus) )
>> +        goto vnuma_onenode;
>> +    if ( copy_from_guest(v->vnode_to_pnode, u_vnuma->vnode_to_pnode,
>> +        nr_vnodes) )
>> +        goto vnuma_onenode;
>> +
>> +    v->nr_vnodes = nr_vnodes;
>> +    *dst = v;
>> +
>> +    return ret;
>> +
>> +vnuma_onenode:
>> +    vnuma_destroy(v);
>> +    return vnuma_fallback(d, dst);
>> +}
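
(For context, a toolstack caller of the new domctl would look roughly like
the sketch below. It follows the usual libxc bounce-buffer idiom; the
function name and signature here are illustrative assumptions, the real
wrapper belongs to the tools patches of this series:)

static int set_vnuma(xc_interface *xch, uint32_t domid,
                     uint32_t nr_vnodes, uint32_t nr_vcpus,
                     vmemrange_t *vmemrange, unsigned int *vdistance,
                     unsigned int *vcpu_to_vnode,
                     unsigned int *vnode_to_pnode)
{
    int rc;
    DECLARE_DOMCTL;
    /* Bounce the four caller-provided arrays into hypercall-safe memory. */
    DECLARE_HYPERCALL_BOUNCE(vmemrange, sizeof(*vmemrange) * nr_vnodes,
                             XC_HYPERCALL_BUFFER_BOUNCE_IN);
    DECLARE_HYPERCALL_BOUNCE(vdistance,
                             sizeof(*vdistance) * nr_vnodes * nr_vnodes,
                             XC_HYPERCALL_BUFFER_BOUNCE_IN);
    DECLARE_HYPERCALL_BOUNCE(vcpu_to_vnode,
                             sizeof(*vcpu_to_vnode) * nr_vcpus,
                             XC_HYPERCALL_BUFFER_BOUNCE_IN);
    DECLARE_HYPERCALL_BOUNCE(vnode_to_pnode,
                             sizeof(*vnode_to_pnode) * nr_vnodes,
                             XC_HYPERCALL_BUFFER_BOUNCE_IN);

    if ( xc_hypercall_bounce_pre(xch, vmemrange)      ||
         xc_hypercall_bounce_pre(xch, vdistance)      ||
         xc_hypercall_bounce_pre(xch, vcpu_to_vnode)  ||
         xc_hypercall_bounce_pre(xch, vnode_to_pnode) )
    {
        rc = -1;
        goto out;
    }

    set_xen_guest_handle(domctl.u.vnuma.vmemrange, vmemrange);
    set_xen_guest_handle(domctl.u.vnuma.vdistance, vdistance);
    set_xen_guest_handle(domctl.u.vnuma.vcpu_to_vnode, vcpu_to_vnode);
    set_xen_guest_handle(domctl.u.vnuma.vnode_to_pnode, vnode_to_pnode);

    domctl.cmd = XEN_DOMCTL_setvnumainfo;
    domctl.domain = (domid_t)domid;
    domctl.u.vnuma.nr_vnodes = nr_vnodes;

    rc = do_domctl(xch, &domctl);

 out:
    xc_hypercall_bounce_post(xch, vmemrange);
    xc_hypercall_bounce_post(xch, vdistance);
    xc_hypercall_bounce_post(xch, vcpu_to_vnode);
    xc_hypercall_bounce_post(xch, vnode_to_pnode);
    return rc;
}
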
>> +
>>  long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>>  {
>>      long ret = 0;
>> @@ -967,6 +1105,35 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>>      }
>>      break;
>>
>> +    case XEN_DOMCTL_setvnumainfo:
>> +    {
>> +        struct vnuma_info *v = NULL;
>> +
>> +        ret = -EFAULT;
>> +        if ( guest_handle_is_null(op->u.vnuma.vdistance)     ||
>> +            guest_handle_is_null(op->u.vnuma.vmemrange)      ||
>> +            guest_handle_is_null(op->u.vnuma.vcpu_to_vnode)  ||
>> +            guest_handle_is_null(op->u.vnuma.vnode_to_pnode) )
>> +            return ret;
>> +
>> +        ret = -EINVAL;
>> +
>> +        ret = vnuma_init(&op->u.vnuma, d, &v);
>> +        if ( ret < 0 || v == NULL )
>> +            break;
>> +
>> +        /* overwrite vnuma for domain */
>> +        if ( !d->vnuma )
>
> You want that in within the domain_lock.
>
> Otherwise a caller (on another CPU) could try to read the
> d->vnuma and blow up. Say by using the serial console and
> wanting to read the guest vNUMA topology.
>
>> +            vnuma_destroy(d->vnuma);
>> +
>> +        domain_lock(d);
>
> I would just do
>
>         vnuma_destroy(d->vnuma)
>
> here and remove the 'if' above.
>> +        d->vnuma = v;
>> +        domain_unlock(d);
>> +
>> +        ret = 0;
>> +    }
>> +    break;
>> +

Agree and done )
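
For v7 the hunk becomes something along these lines (untested sketch,
destroying the old topology under the lock as suggested):

    case XEN_DOMCTL_setvnumainfo:
    {
        struct vnuma_info *v = NULL;

        ret = -EFAULT;
        if ( guest_handle_is_null(op->u.vnuma.vdistance)     ||
             guest_handle_is_null(op->u.vnuma.vmemrange)     ||
             guest_handle_is_null(op->u.vnuma.vcpu_to_vnode) ||
             guest_handle_is_null(op->u.vnuma.vnode_to_pnode) )
            return ret;

        ret = vnuma_init(&op->u.vnuma, d, &v);
        if ( ret < 0 || v == NULL )
            break;

        /* Replace any old topology under the lock so that readers
         * never see a freed pointer. */
        domain_lock(d);
        vnuma_destroy(d->vnuma);
        d->vnuma = v;
        domain_unlock(d);

        ret = 0;
    }
    break;
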

>>      default:
>>          ret = arch_do_domctl(op, d, u_domctl);
>>          break;
>> diff --git a/xen/common/memory.c b/xen/common/memory.c
>> index c2dd31b..925b9fc 100644
>> --- a/xen/common/memory.c
>> +++ b/xen/common/memory.c
>> @@ -969,6 +969,68 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
>>
>>          break;
>>
>> +    case XENMEM_get_vnumainfo:
>> +    {
>> +        struct vnuma_topology_info topology;
>> +        struct domain *d;
>> +        unsigned int dom_vnodes = 0;
>> +
>> +        /*
>> +         * The guest passes nr_vnodes and nr_vcpus, so we know how
>> +         * much memory it has allocated for the output buffers.
>> +         */
>> +        if ( copy_from_guest(&topology, arg, 1) ||
>> +            guest_handle_is_null(topology.vmemrange.h) ||
>> +            guest_handle_is_null(topology.vdistance.h) ||
>> +            guest_handle_is_null(topology.vcpu_to_vnode.h) )
>> +            return -EFAULT;
>> +
>> +        if ( (d = rcu_lock_domain_by_any_id(topology.domid)) == NULL )
>> +            return -ESRCH;
>> +
>> +        rc = -EOPNOTSUPP;
>> +        if ( d->vnuma == NULL )
>> +            goto vnumainfo_out;
>> +
>> +        if ( d->vnuma->nr_vnodes == 0 )
>> +            goto vnumainfo_out;
>> +
>> +        dom_vnodes = d->vnuma->nr_vnodes;
>> +
>> +        /*
>> +         * The guest's nr_vnodes and nr_vcpus may differ from the domain's
>> +         * vNUMA config.  Check them here to make sure we don't overflow
>> +         * the guest-provided buffers.
>> +         */
>> +        rc = -ENOBUFS;
>> +        if ( topology.nr_vnodes < dom_vnodes ||
>> +            topology.nr_vcpus < d->max_vcpus )
>> +            goto vnumainfo_out;
>> +
>> +        rc = -EFAULT;
>> +
>> +        if ( copy_to_guest(topology.vmemrange.h, d->vnuma->vmemrange,
>> +                           dom_vnodes) != 0 )
>> +            goto vnumainfo_out;
>> +
>> +        if ( copy_to_guest(topology.vdistance.h, d->vnuma->vdistance,
>> +                           dom_vnodes * dom_vnodes) != 0 )
>> +            goto vnumainfo_out;
>> +
>> +        if ( copy_to_guest(topology.vcpu_to_vnode.h, d->vnuma->vcpu_to_vnode,
>> +                           d->max_vcpus) != 0 )
>> +            goto vnumainfo_out;
>> +
>> +        topology.nr_vnodes = dom_vnodes;
>> +
>> +        if ( copy_to_guest(arg, &topology, 1) != 0 )
>> +            goto vnumainfo_out;
>> +        rc = 0;
>> +
>> + vnumainfo_out:
>> +        rcu_unlock_domain(d);
>> +        break;
>> +    }
>> +
>>      default:
>>          rc = arch_memory_op(cmd, arg);
>>          break;
>> diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
>> index 5b11bbf..5ee74f4 100644
>> --- a/xen/include/public/domctl.h
>> +++ b/xen/include/public/domctl.h
>> @@ -35,6 +35,7 @@
>>  #include "xen.h"
>>  #include "grant_table.h"
>>  #include "hvm/save.h"
>> +#include "memory.h"
>>
>>  #define XEN_DOMCTL_INTERFACE_VERSION 0x0000000a
>>
>> @@ -934,6 +935,32 @@ struct xen_domctl_vcpu_msrs {
>>  };
>>  typedef struct xen_domctl_vcpu_msrs xen_domctl_vcpu_msrs_t;
>>  DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpu_msrs_t);
>> +
>> +/*
>> + * Used by XEN_DOMCTL_setvnumainfo to set
>> + * the vNUMA topology of a domain.
>> + */
>> +struct xen_domctl_vnuma {
>> +    uint32_t nr_vnodes;
>> +    uint32_t _pad;
>> +    XEN_GUEST_HANDLE_64(uint) vdistance;
>> +    XEN_GUEST_HANDLE_64(uint) vcpu_to_vnode;
>> +
>> +    /*
>> +     * Mapping of vnodes to physical NUMA nodes.
>> +     * This is kept on a per-domain basis for interested
>> +     * consumers, such as NUMA-aware ballooning.
>> +     */
>> +    XEN_GUEST_HANDLE_64(uint) vnode_to_pnode;
>> +
>> +    /* Memory ranges for each vNUMA node. */
>> +    XEN_GUEST_HANDLE_64(vmemrange_t) vmemrange;
>> +};
>> +typedef struct xen_domctl_vnuma xen_domctl_vnuma_t;
>> +DEFINE_XEN_GUEST_HANDLE(xen_domctl_vnuma_t);
>> +
>>  #endif
>>
>>  struct xen_domctl {
>> @@ -1008,6 +1035,7 @@ struct xen_domctl {
>>  #define XEN_DOMCTL_cacheflush                    71
>>  #define XEN_DOMCTL_get_vcpu_msrs                 72
>>  #define XEN_DOMCTL_set_vcpu_msrs                 73
>> +#define XEN_DOMCTL_setvnumainfo                  74
>>  #define XEN_DOMCTL_gdbsx_guestmemio            1000
>>  #define XEN_DOMCTL_gdbsx_pausevcpu             1001
>>  #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
>> @@ -1068,6 +1096,7 @@ struct xen_domctl {
>>          struct xen_domctl_cacheflush        cacheflush;
>>          struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu;
>>          struct xen_domctl_gdbsx_domstatus   gdbsx_domstatus;
>> +        struct xen_domctl_vnuma             vnuma;
>>          uint8_t                             pad[128];
>>      } u;
>>  };
>> diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
>> index 2c57aa0..2c212e1 100644
>> --- a/xen/include/public/memory.h
>> +++ b/xen/include/public/memory.h
>> @@ -521,9 +521,54 @@ DEFINE_XEN_GUEST_HANDLE(xen_mem_sharing_op_t);
>>   * The zero value is appropiate.
>>   */
>>
>> +/* vNUMA node memory range */
>> +struct vmemrange {
>> +    uint64_t start, end;
>> +};
>> +
>> +typedef struct vmemrange vmemrange_t;
>> +DEFINE_XEN_GUEST_HANDLE(vmemrange_t);
>> +
>> +/*
>> + * vNUMA topology specifies the number of vNUMA nodes, the distance
>> + * table, the memory ranges and the vcpu-to-vnode mapping provided
>> + * for guests.  The XENMEM_get_vnumainfo hypercall expects the guest
>> + * to supply nr_vnodes and nr_vcpus, indicating how much buffer space
>> + * it has allocated.  After the guest structures are filled in,
>> + * nr_vnodes and nr_vcpus are copied back to the guest.
>> + */
>> +struct vnuma_topology_info {
>> +    /* IN */
>> +    domid_t domid;
>> +    /* IN/OUT */
>> +    unsigned int nr_vnodes;
>> +    unsigned int nr_vcpus;
>> +    /* OUT */
>> +    union {
>> +        XEN_GUEST_HANDLE(uint) h;
>> +        uint64_t pad;
>> +    } vdistance;
>> +    union {
>> +        XEN_GUEST_HANDLE(uint) h;
>> +        uint64_t pad;
>> +    } vcpu_to_vnode;
>> +    union {
>> +        XEN_GUEST_HANDLE(vmemrange_t) h;
>> +        uint64_t pad;
>> +    } vmemrange;
>> +};
>> +typedef struct vnuma_topology_info vnuma_topology_info_t;
>> +DEFINE_XEN_GUEST_HANDLE(vnuma_topology_info_t);
>> +
>> +/*
>> + * XENMEM_get_vnumainfo is used by a guest to retrieve
>> + * its vNUMA topology from the hypervisor.
>> + */
>> +#define XENMEM_get_vnumainfo               26
>> +
>>  #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
>>
>> -/* Next available subop number is 26 */
>> +/* Next available subop number is 27 */
>>
>>  #endif /* __XEN_PUBLIC_MEMORY_H__ */
>>
>> diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
>> index bb1c398..d29a84d 100644
>> --- a/xen/include/xen/domain.h
>> +++ b/xen/include/xen/domain.h
>> @@ -89,4 +89,15 @@ extern unsigned int xen_processor_pmbits;
>>
>>  extern bool_t opt_dom0_vcpus_pin;
>>
>> +/* Per-domain vNUMA topology. */
>> +struct vnuma_info {
>> +    unsigned int nr_vnodes;
>> +    unsigned int *vdistance;
>> +    unsigned int *vcpu_to_vnode;
>> +    unsigned int *vnode_to_pnode;
>> +    struct vmemrange *vmemrange;
>> +};
>> +
>> +void vnuma_destroy(struct vnuma_info *vnuma);
>> +
>>  #endif /* __XEN_DOMAIN_H__ */
>> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
>> index d5bc461..71e4218 100644
>> --- a/xen/include/xen/sched.h
>> +++ b/xen/include/xen/sched.h
>> @@ -447,6 +447,7 @@ struct domain
>>      nodemask_t node_affinity;
>>      unsigned int last_alloc_node;
>>      spinlock_t node_affinity_lock;
>> +    struct vnuma_info *vnuma;
>>  };
>>
>>  struct domain_setup_info
>> --
>> 1.7.10.4
>>
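
For completeness, a guest would consume the new subop roughly as in the
sketch below (illustrative only, not part of this patch; MAX_VNODES,
MAX_VCPUS and the HYPERVISOR_memory_op wrapper are guest-specific
assumptions):

#define MAX_VNODES  8
#define MAX_VCPUS  64

static int get_vnuma_topology(void)
{
    static struct vmemrange ranges[MAX_VNODES];
    static unsigned int dist[MAX_VNODES * MAX_VNODES];
    static unsigned int cpu_to_node[MAX_VCPUS];
    struct vnuma_topology_info topo = {
        .domid     = DOMID_SELF,
        /* Tell Xen how much buffer space we have allocated. */
        .nr_vnodes = MAX_VNODES,
        .nr_vcpus  = MAX_VCPUS,
    };
    int rc;

    set_xen_guest_handle(topo.vmemrange.h, ranges);
    set_xen_guest_handle(topo.vdistance.h, dist);
    set_xen_guest_handle(topo.vcpu_to_vnode.h, cpu_to_node);

    rc = HYPERVISOR_memory_op(XENMEM_get_vnumainfo, &topo);
    if ( rc )
        return rc;   /* -ENOBUFS means our buffers were too small */

    /* topo.nr_vnodes now holds the actual number of vnodes. */
    return 0;
}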



-- 
Elena
