
Re: [Xen-devel] [PATCH RESEND v7 1/9] xen: vnuma topology and subop hypercalls



>>> On 21.08.14 at 07:08, <ufimtseva@xxxxxxxxx> wrote:
> --- a/xen/common/domctl.c
> +++ b/xen/common/domctl.c
> @@ -297,6 +297,99 @@ int vcpuaffinity_params_invalid(const xen_domctl_vcpuaffinity_t *vcpuaff)
>              guest_handle_is_null(vcpuaff->cpumap_soft.bitmap));
>  }
>  
> +/*
> + * Allocates memory for vNUMA, **vnuma should be NULL.
> + * Caller has to make sure that domain has max_pages
> + * and number of vcpus set for domain.
> + * Verifies that single allocation does not exceed
> + * PAGE_SIZE.
> + */
> +static int vnuma_alloc(struct vnuma_info **vnuma,
> +                       unsigned int nr_vnodes,
> +                       unsigned int nr_vcpus)
> +{
> +    if ( vnuma && *vnuma )

I can see the point of the right side of the &&, but the left side doesn't
seem to make sense in a static function - the callers should get it right,
or it's a bug, not an error.
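
I.e. something along the lines of (untested):

    ASSERT(vnuma);
    if ( *vnuma )
        return -EINVAL;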

> +        return -EINVAL;
> +
> +    if ( nr_vnodes > XEN_MAX_VNODES )
> +        return -EINVAL;
> +
> +    /*
> +     * If XEN_MAX_VNODES increases, these allocations
> +     * should be split into PAGE_SIZE allocations
> +     * due to XSA-77.
> +     */
> +    *vnuma = xzalloc(struct vnuma_info);

This could be xmalloc(), since it doesn't get installed into struct
domain anyway before ->nr_vnodes gets set properly (and the
allocations below don't zero their memory either).
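
I.e. simply:

    *vnuma = xmalloc(struct vnuma_info);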

> +    if ( !*vnuma )
> +        return -ENOMEM;
> +
> +    (*vnuma)->vdistance = xmalloc_array(unsigned int, nr_vnodes * nr_vnodes);
> +    (*vnuma)->vmemrange = xmalloc_array(vmemrange_t, nr_vnodes);
> +    (*vnuma)->vcpu_to_vnode = xmalloc_array(unsigned int, nr_vcpus);
> +    (*vnuma)->vnode_to_pnode = xmalloc_array(unsigned int, nr_vnodes);
> +
> +    if ( (*vnuma)->vdistance == NULL || (*vnuma)->vmemrange == NULL ||
> +         (*vnuma)->vcpu_to_vnode == NULL || (*vnuma)->vnode_to_pnode == NULL )
> +    {
> +        vnuma_destroy(*vnuma);
> +        return -ENOMEM;
> +    }
> +
> +    return 0;
> +}
> +

I think vnuma_destroy() would better go here than in a different
source file.

> +/*
> + * Construct vNUMA topology from u_vnuma struct and return
> + * it in dst.
> + */
> +long vnuma_init(const struct xen_domctl_vnuma *u_vnuma,
> +                const struct domain *d,
> +                struct vnuma_info **dst)
> +{
> +    unsigned int nr_vnodes;
> +    long ret = -EINVAL;

Any reason for this and the function return type being "long" rather
than "int"?

> +    struct vnuma_info *v = NULL;
> +
> +    /* If vNUMA topology already set, just exit. */
> +    if ( *dst )
> +        return ret;
> +
> +    nr_vnodes = u_vnuma->nr_vnodes;
> +
> +    if ( nr_vnodes == 0 )
> +        return ret;
> +
> +    ret = vnuma_alloc(&v, nr_vnodes, d->max_vcpus);

If you made use of the definitions in xen/err.h you could avoid the
indirection on the first argument (dropping it altogether).
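
E.g. (untested sketch only, names as in your patch):

    static struct vnuma_info *vnuma_alloc(unsigned int nr_vnodes,
                                          unsigned int nr_vcpus)
    {
        struct vnuma_info *vnuma;

        if ( nr_vnodes > XEN_MAX_VNODES )
            return ERR_PTR(-EINVAL);

        vnuma = xmalloc(struct vnuma_info);
        if ( !vnuma )
            return ERR_PTR(-ENOMEM);

        vnuma->vdistance = xmalloc_array(unsigned int, nr_vnodes * nr_vnodes);
        vnuma->vmemrange = xmalloc_array(vmemrange_t, nr_vnodes);
        vnuma->vcpu_to_vnode = xmalloc_array(unsigned int, nr_vcpus);
        vnuma->vnode_to_pnode = xmalloc_array(unsigned int, nr_vnodes);
        if ( !vnuma->vdistance || !vnuma->vmemrange ||
             !vnuma->vcpu_to_vnode || !vnuma->vnode_to_pnode )
        {
            vnuma_destroy(vnuma);
            return ERR_PTR(-ENOMEM);
        }

        return vnuma;
    }

with the caller then doing

    vnuma = vnuma_alloc(nr_vnodes, d->max_vcpus);
    if ( IS_ERR(vnuma) )
        return PTR_ERR(vnuma);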

> +    if ( ret )
> +        return ret;
> +
> +    ret = -EFAULT;
> +
> +    if ( copy_from_guest(v->vdistance, u_vnuma->vdistance,
> +                         nr_vnodes * nr_vnodes) )
> +        goto vnuma_fail;
> +
> +    if ( copy_from_guest(v->vmemrange, u_vnuma->vmemrange, nr_vnodes) )
> +        goto vnuma_fail;

Isn't a single memory range per vnode rather limiting? Physical
machines frequently have at least one node with two ranges to
accommodate the hole below 4Gb. And since the interface is for
all guest kinds I'm afraid this will harm setting up guests sooner
rather than later. (I'm sorry for thinking of this only now.)
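
Just to illustrate the direction (not a worked out proposal): the range
array could carry a per-range node ID, plus an IN/OUT nr_vmemranges count
in the topology info, so that a vnode can own an arbitrary number of
ranges:

    struct vmemrange {
        uint64_t start, end;
        uint32_t nid;   /* vnode this range belongs to */
        uint32_t pad;
    };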

> +
> +    if ( copy_from_guest(v->vcpu_to_vnode, u_vnuma->vcpu_to_vnode,
> +                         d->max_vcpus) )
> +        goto vnuma_fail;
> +
> +    if ( copy_from_guest(v->vnode_to_pnode, u_vnuma->vnode_to_pnode,
> +                         nr_vnodes) )
> +        goto vnuma_fail;
> +
> +    v->nr_vnodes = nr_vnodes;
> +    *dst = v;
> +
> +    return 0;
> +
> + vnuma_fail:
> +    vnuma_destroy(v);
> +    return ret;
> +}
> +
>  long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>  {
>      long ret = 0;
> @@ -967,6 +1060,35 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>      }
>      break;
>  
> +    case XEN_DOMCTL_setvnumainfo:
> +    {
> +        struct vnuma_info *v = NULL;

Bad variable name (normally stands for vCPU).

> +
> +        ret = -EINVAL;
> +
> +        if ( guest_handle_is_null(op->u.vnuma.vdistance)     ||
> +            guest_handle_is_null(op->u.vnuma.vmemrange)      ||
> +            guest_handle_is_null(op->u.vnuma.vcpu_to_vnode)  ||
> +            guest_handle_is_null(op->u.vnuma.vnode_to_pnode) ) {

Indentation. Also - is there a precedent for doing such checks
elsewhere? Unless null handles have a special meaning we normally
just implicitly produce -EFAULT when accessing the usually unmapped
memory region at address zero.

> --- a/xen/common/memory.c
> +++ b/xen/common/memory.c
> @@ -969,6 +969,105 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
>  
>          break;
>  
> +    case XENMEM_get_vnumainfo:
> +    {
> +        struct vnuma_topology_info topology;
> +        struct domain *d;

There is a suitable variable already at function scope.

> +        unsigned int dom_vnodes, dom_vcpus;
> +        struct vnuma_info vnuma_tmp;

Calling this just "tmp" would be quite okay here.

> +
> +        /*
> +         * guest passes nr_vnodes and nr_vcpus thus
> +         * we know how much memory guest has allocated.
> +         */
> +        if ( copy_from_guest(&topology, arg, 1) ||
> +            guest_handle_is_null(topology.vmemrange.h) ||
> +            guest_handle_is_null(topology.vdistance.h) ||
> +            guest_handle_is_null(topology.vcpu_to_vnode.h) )

Same two remarks here.

> +            return -EFAULT;
> +
> +        if ( (d = rcu_lock_domain_by_any_id(topology.domid)) == NULL )
> +            return -ESRCH;
> +
> +        rc = -EOPNOTSUPP;

This is pointless - the set value is being used only in one error path
below where you could as well use the literal.
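
I.e. just:

    if ( d->vnuma == NULL )
    {
        read_unlock(&d->vnuma_rwlock);
        rcu_unlock_domain(d);
        return -EOPNOTSUPP;
    }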

> +
> +        read_lock(&d->vnuma_rwlock);
> +
> +        if ( d->vnuma == NULL )
> +        {
> +            read_unlock(&d->vnuma_rwlock);
> +            rcu_unlock_domain(d);
> +            return rc;
> +        }
> +
> +        dom_vnodes = d->vnuma->nr_vnodes;
> +        dom_vcpus = d->max_vcpus;
> +
> +        read_unlock(&d->vnuma_rwlock);
> +
> +        vnuma_tmp.vdistance = xmalloc_array(unsigned int,
> +                                            dom_vnodes * dom_vnodes);
> +        vnuma_tmp.vmemrange = xmalloc_array(vmemrange_t, dom_vnodes);
> +        vnuma_tmp.vcpu_to_vnode = xmalloc_array(unsigned int, dom_vcpus);
> +
> +        if ( vnuma_tmp.vdistance == NULL || vnuma_tmp.vmemrange == NULL ||
> +             vnuma_tmp.vcpu_to_vnode == NULL )
> +        {
> +            rc = -ENOMEM;
> +            goto vnumainfo_out;
> +        }

Please use consistent style when the body of the if might be just a
single control transfer: Further down you set "rc" before the if,
here you do it inside its body.

> +
> +        read_lock(&d->vnuma_rwlock);

Even if holding a lock across allocations is generally better avoided,
I think you'd be better off not dropping the lock here, as doing so
may let you return inconsistent data to your caller. Or alternatively
check that the two counts didn't change in the meantime, starting over
if they did.
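
The latter could look like this (untested; "again" would be a new label
at the initial read_lock()):

    read_lock(&d->vnuma_rwlock);

    /* The vNUMA configuration may have changed while unlocked. */
    if ( d->vnuma == NULL || dom_vnodes != d->vnuma->nr_vnodes )
    {
        read_unlock(&d->vnuma_rwlock);
        xfree(vnuma_tmp.vdistance);
        xfree(vnuma_tmp.vmemrange);
        xfree(vnuma_tmp.vcpu_to_vnode);
        goto again;
    }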

> +
> +        memcpy(vnuma_tmp.vmemrange, d->vnuma->vmemrange,
> +               sizeof(*d->vnuma->vmemrange) * dom_vnodes);
> +        memcpy(vnuma_tmp.vdistance, d->vnuma->vdistance,
> +               sizeof(*d->vnuma->vdistance) * dom_vnodes * dom_vnodes);
> +        memcpy(vnuma_tmp.vcpu_to_vnode, d->vnuma->vcpu_to_vnode,
> +               sizeof(*d->vnuma->vcpu_to_vnode) * dom_vcpus);
> +
> +        read_unlock(&d->vnuma_rwlock);
> +
> +        /*
> +         * guest nr_cpus and nr_nodes may differ from domain vnuma config.
> +     * Check here guest nr_nodes and nr_cpus to make sure we dont overflow.
> +         */
> +        rc = -ENOBUFS;
> +        if ( topology.nr_vnodes < dom_vnodes ||
> +             topology.nr_vcpus < dom_vcpus )
> +            goto vnumainfo_out;

Wouldn't this better be done earlier (before allocating any of the
intermediate buffers)? And shouldn't you tell the caller the needed
values?
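
E.g., right after the counts have been read and the lock dropped
(untested):

    if ( topology.nr_vnodes < dom_vnodes || topology.nr_vcpus < dom_vcpus )
    {
        /* Tell the caller the required sizes before bailing. */
        topology.nr_vnodes = dom_vnodes;
        topology.nr_vcpus = dom_vcpus;
        rc = __copy_to_guest(arg, &topology, 1) ? -EFAULT : -ENOBUFS;
        rcu_unlock_domain(d);
        break;
    }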

> +
> +        rc = -EFAULT;
> +
> +        if ( copy_to_guest(topology.vmemrange.h, vnuma_tmp.vmemrange,
> +                           dom_vnodes) != 0 )
> +            goto vnumainfo_out;
> +
> +        if ( copy_to_guest(topology.vdistance.h, vnuma_tmp.vdistance,
> +                           dom_vnodes * dom_vnodes) != 0 )
> +            goto vnumainfo_out;
> +
> +        if ( copy_to_guest(topology.vcpu_to_vnode.h, vnuma_tmp.vcpu_to_vnode,
> +                           dom_vcpus) != 0 )
> +            goto vnumainfo_out;
> +
> +        topology.nr_vnodes = dom_vnodes;
> +        topology.nr_vcpus = dom_vcpus;
> +
> +        if ( __copy_to_guest(arg, &topology, 1) != 0 )
> +            goto vnumainfo_out;
> +
> +        rc = 0;
> +
> + vnumainfo_out:
> +        rcu_unlock_domain(d);
> +
> +        xfree(vnuma_tmp.vdistance);
> +        xfree(vnuma_tmp.vmemrange);
> +        xfree(vnuma_tmp.vcpu_to_vnode);
> +        break;
> +    }
> +
>      default:
>          rc = arch_memory_op(cmd, arg);
>          break;
> diff --git a/xen/include/public/arch-x86/xen.h b/xen/include/public/arch-x86/xen.h
> index f35804b..a4c3d58 100644
> --- a/xen/include/public/arch-x86/xen.h
> +++ b/xen/include/public/arch-x86/xen.h
> @@ -108,6 +108,15 @@ typedef unsigned long xen_pfn_t;
>  /* Maximum number of virtual CPUs in legacy multi-processor guests. */
>  #define XEN_LEGACY_MAX_VCPUS 32
>  
> +/*
> + * Maximum number of virtual NUMA nodes per domain.
> + * This restriction is related to a security advice
> + * XSA-77 and max xmalloc size of PAGE_SIZE. This limit
> + * avoids multi page allocation for vnuma. This limit
> + * will be increased in next version.
> + */
> +#define XEN_MAX_VNODES 32

This is pointless (and perhaps in the wrong header) - it's purely an
implementation detail that there currently is such a limit. You don't
even need the #define in an internal header afaict, since the only
place it's being used could easily do a suitable check against PAGE_SIZE.
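
E.g. in vnuma_alloc() (untested; nr_vnodes being non-zero was already
checked by the caller):

    /*
     * The square distance table is the largest of the allocations -
     * refuse anything making a single allocation exceed PAGE_SIZE
     * (see XSA-77).
     */
    if ( nr_vnodes > PAGE_SIZE / (nr_vnodes * sizeof(unsigned int)) )
        return -EINVAL;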

> --- a/xen/include/public/memory.h
> +++ b/xen/include/public/memory.h
> @@ -521,9 +521,54 @@ DEFINE_XEN_GUEST_HANDLE(xen_mem_sharing_op_t);
>   * The zero value is appropiate.
>   */
>  
> +/* vNUMA node memory range */
> +struct vmemrange {
> +    uint64_t start, end;
> +};
> +
> +typedef struct vmemrange vmemrange_t;
> +DEFINE_XEN_GUEST_HANDLE(vmemrange_t);
> +
> +/*
> + * vNUMA topology specifies vNUMA node number, distance table,
> + * memory ranges and vcpu mapping provided for guests.
> + * XENMEM_get_vnumainfo hypercall expects to see from guest
> + * nr_vnodes and nr_vcpus to indicate available memory. After
> + * filling guests structures, nr_vnodes and nr_vcpus copied
> + * back to guest.
> + */
> +struct vnuma_topology_info {
> +    /* IN */
> +    domid_t domid;
> +    /* IN/OUT */
> +    unsigned int nr_vnodes;

Please make the padding between above two fields explicit.
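
E.g.:

    /* IN */
    domid_t domid;
    uint16_t pad;
    /* IN/OUT */
    unsigned int nr_vnodes;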

> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -452,6 +452,10 @@ struct domain
>      nodemask_t node_affinity;
>      unsigned int last_alloc_node;
>      spinlock_t node_affinity_lock;
> +
> +    /* vNUMA topology accesses are protected by rwlock. */
> +    rwlock_t vnuma_rwlock;

I think there's no need for the "rw" in the field name.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

