
Re: [Xen-devel] [PATCH RESEND v7 1/9] xen: vnuma topology and subop hypercalls



>>> On 21.08.14 at 07:08, <ufimtseva@xxxxxxxxx> wrote:
> --- a/xen/common/domctl.c
> +++ b/xen/common/domctl.c
> @@ -297,6 +297,99 @@ int vcpuaffinity_params_invalid(const xen_domctl_vcpuaffinity_t *vcpuaff)
>              guest_handle_is_null(vcpuaff->cpumap_soft.bitmap));
>  }
>  
> +/*
> + * Allocates memory for vNUMA, **vnuma should be NULL.
> + * Caller has to make sure that domain has max_pages
> + * and number of vcpus set for domain.
> + * Verifies that single allocation does not exceed
> + * PAGE_SIZE.
> + */
> +static int vnuma_alloc(struct vnuma_info **vnuma,
> +                       unsigned int nr_vnodes,
> +                       unsigned int nr_vcpus)
> +{
> +    if ( vnuma && *vnuma )

I can see the point of the right side of the &&, but the left side doesn't
seem to make sense in a static function - the callers should get it right,
or it's a bug, not an error.
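
I.e. something along the lines of (untested):

    ASSERT(vnuma);
    if ( *vnuma )
        return -EINVAL;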

> +        return -EINVAL;
> +
> +    if ( nr_vnodes > XEN_MAX_VNODES )
> +        return -EINVAL;
> +
> +    /*
> +     * If XEN_MAX_VNODES increases, these allocations
> +     * should be split into PAGE_SIZE allocations
> +     * due to XSA-77.
> +     */
> +    *vnuma = xzalloc(struct vnuma_info);

This could be xmalloc(), since it doesn't get installed into struct
domain anyway before ->nr_vnodes gets set properly (and the
allocations below don't zero their memory either).
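
I.e. simply:

    *vnuma = xmalloc(struct vnuma_info);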

> +    if ( !*vnuma )
> +        return -ENOMEM;
> +
> +    (*vnuma)->vdistance = xmalloc_array(unsigned int, nr_vnodes * nr_vnodes);
> +    (*vnuma)->vmemrange = xmalloc_array(vmemrange_t, nr_vnodes);
> +    (*vnuma)->vcpu_to_vnode = xmalloc_array(unsigned int, nr_vcpus);
> +    (*vnuma)->vnode_to_pnode = xmalloc_array(unsigned int, nr_vnodes);
> +
> +    if ( (*vnuma)->vdistance == NULL || (*vnuma)->vmemrange == NULL ||
> +         (*vnuma)->vcpu_to_vnode == NULL || (*vnuma)->vnode_to_pnode == NULL )
> +    {
> +        vnuma_destroy(*vnuma);
> +        return -ENOMEM;
> +    }
> +
> +    return 0;
> +}
> +

I think vnuma_destroy() would better go here than in a different
source file.

> +/*
> + * Construct vNUMA topology from u_vnuma struct and return
> + * it in dst.
> + */
> +long vnuma_init(const struct xen_domctl_vnuma *u_vnuma,
> +                const struct domain *d,
> +                struct vnuma_info **dst)
> +{
> +    unsigned int nr_vnodes;
> +    long ret = -EINVAL;

Any reason for this and the function return type being "long" rather
than "int"?

> +    struct vnuma_info *v = NULL;
> +
> +    /* If vNUMA topology already set, just exit. */
> +    if ( *dst )
> +        return ret;
> +
> +    nr_vnodes = u_vnuma->nr_vnodes;
> +
> +    if ( nr_vnodes == 0 )
> +        return ret;
> +
> +    ret = vnuma_alloc(&v, nr_vnodes, d->max_vcpus);

If you made use of the definitions in xen/err.h you could avoid the
indirection on the first argument (dropping it altogether).
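
E.g. (untested sketch only, names as in your patch):

    static struct vnuma_info *vnuma_alloc(unsigned int nr_vnodes,
                                          unsigned int nr_vcpus)
    {
        struct vnuma_info *vnuma;

        if ( nr_vnodes > XEN_MAX_VNODES )
            return ERR_PTR(-EINVAL);

        vnuma = xmalloc(struct vnuma_info);
        if ( !vnuma )
            return ERR_PTR(-ENOMEM);

        vnuma->vdistance = xmalloc_array(unsigned int, nr_vnodes * nr_vnodes);
        vnuma->vmemrange = xmalloc_array(vmemrange_t, nr_vnodes);
        vnuma->vcpu_to_vnode = xmalloc_array(unsigned int, nr_vcpus);
        vnuma->vnode_to_pnode = xmalloc_array(unsigned int, nr_vnodes);
        if ( !vnuma->vdistance || !vnuma->vmemrange ||
             !vnuma->vcpu_to_vnode || !vnuma->vnode_to_pnode )
        {
            vnuma_destroy(vnuma);
            return ERR_PTR(-ENOMEM);
        }

        return vnuma;
    }

with the caller then doing

    vnuma = vnuma_alloc(nr_vnodes, d->max_vcpus);
    if ( IS_ERR(vnuma) )
        return PTR_ERR(vnuma);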

> +    if ( ret )
> +        return ret;
> +
> +    ret = -EFAULT;
> +
> +    if ( copy_from_guest(v->vdistance, u_vnuma->vdistance,
> +                         nr_vnodes * nr_vnodes) )
> +        goto vnuma_fail;
> +
> +    if ( copy_from_guest(v->vmemrange, u_vnuma->vmemrange, nr_vnodes) )
> +        goto vnuma_fail;

Isn't a single memory range per vnode rather limiting? Physical
machines frequently have at least one node with two ranges to
accommodate the hole below 4Gb. And since the interface is for
all guest kinds I'm afraid this will harm setting up guests sooner
rather than later. (I'm sorry for thinking of this only now.)
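
Just to illustrate the direction (not a worked out proposal): the range
array could carry a per-range node ID, plus an IN/OUT nr_vmemranges count
in the topology info, so that a vnode can own an arbitrary number of
ranges:

    struct vmemrange {
        uint64_t start, end;
        uint32_t nid;   /* vnode this range belongs to */
        uint32_t pad;
    };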

> +
> +    if ( copy_from_guest(v->vcpu_to_vnode, u_vnuma->vcpu_to_vnode,
> +                         d->max_vcpus) )
> +        goto vnuma_fail;
> +
> +    if ( copy_from_guest(v->vnode_to_pnode, u_vnuma->vnode_to_pnode,
> +                         nr_vnodes) )
> +        goto vnuma_fail;
> +
> +    v->nr_vnodes = nr_vnodes;
> +    *dst = v;
> +
> +    return 0;
> +
> + vnuma_fail:
> +    vnuma_destroy(v);
> +    return ret;
> +}
> +
>  long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>  {
>      long ret = 0;
> @@ -967,6 +1060,35 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>      }
>      break;
>  
> +    case XEN_DOMCTL_setvnumainfo:
> +    {
> +        struct vnuma_info *v = NULL;

Bad variable name (normally stands for vCPU).

> +
> +        ret = -EINVAL;
> +
> +        if ( guest_handle_is_null(op->u.vnuma.vdistance)     ||
> +            guest_handle_is_null(op->u.vnuma.vmemrange)      ||
> +            guest_handle_is_null(op->u.vnuma.vcpu_to_vnode)  ||
> +            guest_handle_is_null(op->u.vnuma.vnode_to_pnode) ) {

Indentation. Also - is there a precedent for doing such checks
elsewhere? Unless null handles have a special meaning we normally
just implicitly produce -EFAULT when accessing the usually unmapped
memory region at address zero.

> --- a/xen/common/memory.c
> +++ b/xen/common/memory.c
> @@ -969,6 +969,105 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
>  
>          break;
>  
> +    case XENMEM_get_vnumainfo:
> +    {
> +        struct vnuma_topology_info topology;
> +        struct domain *d;

There is a suitable variable already at function scope.

> +        unsigned int dom_vnodes, dom_vcpus;
> +        struct vnuma_info vnuma_tmp;

Calling this just "tmp" would be quite okay here.

> +
> +        /*
> +         * guest passes nr_vnodes and nr_vcpus thus
> +         * we know how much memory guest has allocated.
> +         */
> +        if ( copy_from_guest(&topology, arg, 1) ||
> +            guest_handle_is_null(topology.vmemrange.h) ||
> +            guest_handle_is_null(topology.vdistance.h) ||
> +            guest_handle_is_null(topology.vcpu_to_vnode.h) )

Same two remarks here.

> +            return -EFAULT;
> +
> +        if ( (d = rcu_lock_domain_by_any_id(topology.domid)) == NULL )
> +            return -ESRCH;
> +
> +        rc = -EOPNOTSUPP;

This is pointless - the set value is being used only in one error path
below where you could as well use the literal.
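
I.e. just:

    if ( d->vnuma == NULL )
    {
        read_unlock(&d->vnuma_rwlock);
        rcu_unlock_domain(d);
        return -EOPNOTSUPP;
    }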

> +
> +        read_lock(&d->vnuma_rwlock);
> +
> +        if ( d->vnuma == NULL )
> +        {
> +            read_unlock(&d->vnuma_rwlock);
> +            rcu_unlock_domain(d);
> +            return rc;
> +        }
> +
> +        dom_vnodes = d->vnuma->nr_vnodes;
> +        dom_vcpus = d->max_vcpus;
> +
> +        read_unlock(&d->vnuma_rwlock);
> +
> +        vnuma_tmp.vdistance = xmalloc_array(unsigned int,
> +                                            dom_vnodes * dom_vnodes);
> +        vnuma_tmp.vmemrange = xmalloc_array(vmemrange_t, dom_vnodes);
> +        vnuma_tmp.vcpu_to_vnode = xmalloc_array(unsigned int, dom_vcpus);
> +
> +        if ( vnuma_tmp.vdistance == NULL || vnuma_tmp.vmemrange == NULL ||
> +             vnuma_tmp.vcpu_to_vnode == NULL )
> +        {
> +            rc = -ENOMEM;
> +            goto vnumainfo_out;
> +        }

Please use consistent style when the body of the if might be just a
single control transfer: Further down you set "rc" before the if,
here you do it inside its body.

> +
> +        read_lock(&d->vnuma_rwlock);

Even if holding a lock across allocations is generally better avoided,
I think you'd be better off not dropping the lock here, as doing so
may let you return inconsistent data to your caller. Or alternatively
check that the two counts didn't change in the meantime, starting over
if they did.
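
The latter could look like this (untested; "again" would be a new label
at the initial read_lock()):

    read_lock(&d->vnuma_rwlock);

    /* The vNUMA configuration may have changed while unlocked. */
    if ( d->vnuma == NULL || dom_vnodes != d->vnuma->nr_vnodes )
    {
        read_unlock(&d->vnuma_rwlock);
        xfree(vnuma_tmp.vdistance);
        xfree(vnuma_tmp.vmemrange);
        xfree(vnuma_tmp.vcpu_to_vnode);
        goto again;
    }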

> +
> +        memcpy(vnuma_tmp.vmemrange, d->vnuma->vmemrange,
> +               sizeof(*d->vnuma->vmemrange) * dom_vnodes);
> +        memcpy(vnuma_tmp.vdistance, d->vnuma->vdistance,
> +               sizeof(*d->vnuma->vdistance) * dom_vnodes * dom_vnodes);
> +        memcpy(vnuma_tmp.vcpu_to_vnode, d->vnuma->vcpu_to_vnode,
> +               sizeof(*d->vnuma->vcpu_to_vnode) * dom_vcpus);
> +
> +        read_unlock(&d->vnuma_rwlock);
> +
> +        /*
> +         * guest nr_cpus and nr_nodes may differ from domain vnuma config.
> +     * Check here guest nr_nodes and nr_cpus to make sure we dont overflow.
> +         */
> +        rc = -ENOBUFS;
> +        if ( topology.nr_vnodes < dom_vnodes ||
> +             topology.nr_vcpus < dom_vcpus )
> +            goto vnumainfo_out;

Wouldn't this better be done earlier (before allocating any of the
intermediate buffers)? And shouldn't you tell the caller the needed
values?
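
E.g., right after the counts have been read and the lock dropped
(untested):

    if ( topology.nr_vnodes < dom_vnodes || topology.nr_vcpus < dom_vcpus )
    {
        /* Tell the caller the required sizes before bailing. */
        topology.nr_vnodes = dom_vnodes;
        topology.nr_vcpus = dom_vcpus;
        rc = __copy_to_guest(arg, &topology, 1) ? -EFAULT : -ENOBUFS;
        rcu_unlock_domain(d);
        break;
    }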

> +
> +        rc = -EFAULT;
> +
> +        if ( copy_to_guest(topology.vmemrange.h, vnuma_tmp.vmemrange,
> +                           dom_vnodes) != 0 )
> +            goto vnumainfo_out;
> +
> +        if ( copy_to_guest(topology.vdistance.h, vnuma_tmp.vdistance,
> +                           dom_vnodes * dom_vnodes) != 0 )
> +            goto vnumainfo_out;
> +
> +        if ( copy_to_guest(topology.vcpu_to_vnode.h, vnuma_tmp.vcpu_to_vnode,
> +                           dom_vcpus) != 0 )
> +            goto vnumainfo_out;
> +
> +        topology.nr_vnodes = dom_vnodes;
> +        topology.nr_vcpus = dom_vcpus;
> +
> +        if ( __copy_to_guest(arg, &topology, 1) != 0 )
> +            goto vnumainfo_out;
> +
> +        rc = 0;
> +
> + vnumainfo_out:
> +        rcu_unlock_domain(d);
> +
> +        xfree(vnuma_tmp.vdistance);
> +        xfree(vnuma_tmp.vmemrange);
> +        xfree(vnuma_tmp.vcpu_to_vnode);
> +        break;
> +    }
> +
>      default:
>          rc = arch_memory_op(cmd, arg);
>          break;
> diff --git a/xen/include/public/arch-x86/xen.h b/xen/include/public/arch-x86/xen.h
> index f35804b..a4c3d58 100644
> --- a/xen/include/public/arch-x86/xen.h
> +++ b/xen/include/public/arch-x86/xen.h
> @@ -108,6 +108,15 @@ typedef unsigned long xen_pfn_t;
>  /* Maximum number of virtual CPUs in legacy multi-processor guests. */
>  #define XEN_LEGACY_MAX_VCPUS 32
>  
> +/*
> + * Maximum number of virtual NUMA nodes per domain.
> + * This restriction is related to a security advice
> + * XSA-77 and max xmalloc size of PAGE_SIZE. This limit
> + * avoids multi page allocation for vnuma. This limit
> + * will be increased in next version.
> + */
> +#define XEN_MAX_VNODES 32

This is pointless (and perhaps in the wrong header) - it's purely an
implementation detail that there currently is such a limit. You don't
even need the #define in an internal header afaict, since the only
place it's being used could easily do a suitable check against PAGE_SIZE.
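
E.g. in vnuma_alloc() (untested; nr_vnodes being non-zero was already
checked by the caller):

    /*
     * The square distance table is the largest of the allocations -
     * refuse anything making a single allocation exceed PAGE_SIZE
     * (see XSA-77).
     */
    if ( nr_vnodes > PAGE_SIZE / (nr_vnodes * sizeof(unsigned int)) )
        return -EINVAL;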

> --- a/xen/include/public/memory.h
> +++ b/xen/include/public/memory.h
> @@ -521,9 +521,54 @@ DEFINE_XEN_GUEST_HANDLE(xen_mem_sharing_op_t);
>   * The zero value is appropiate.
>   */
>  
> +/* vNUMA node memory range */
> +struct vmemrange {
> +    uint64_t start, end;
> +};
> +
> +typedef struct vmemrange vmemrange_t;
> +DEFINE_XEN_GUEST_HANDLE(vmemrange_t);
> +
> +/*
> + * vNUMA topology specifies vNUMA node number, distance table,
> + * memory ranges and vcpu mapping provided for guests.
> + * XENMEM_get_vnumainfo hypercall expects to see from guest
> + * nr_vnodes and nr_vcpus to indicate available memory. After
> + * filling guests structures, nr_vnodes and nr_vcpus copied
> + * back to guest.
> + */
> +struct vnuma_topology_info {
> +    /* IN */
> +    domid_t domid;
> +    /* IN/OUT */
> +    unsigned int nr_vnodes;

Please make the padding between above two fields explicit.
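
E.g.:

    /* IN */
    domid_t domid;
    uint16_t pad;
    /* IN/OUT */
    unsigned int nr_vnodes;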

> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -452,6 +452,10 @@ struct domain
>      nodemask_t node_affinity;
>      unsigned int last_alloc_node;
>      spinlock_t node_affinity_lock;
> +
> +    /* vNUMA topology accesses are protected by rwlock. */
> +    rwlock_t vnuma_rwlock;

I think there's no need for the "rw" in the field name.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

