Re: [Xen-devel] [PATCH v6 01/10] xen: vnuma topology and subop hypercalls
On Fri, Jul 18, 2014 at 9:49 AM, Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx> wrote:
> On Fri, Jul 18, 2014 at 01:50:00AM -0400, Elena Ufimtseva wrote:
>> Define interface, structures and hypercalls for toolstack to
>> build vnuma topology and for guests that wish to retrieve it.
>> Two subop hypercalls introduced by patch:
>> XEN_DOMCTL_setvnumainfo to define vNUMA domain topology per domain
>> and XENMEM_get_vnumainfo to retrieve that topology by guest.
>>
>> Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx>
>> ---
>>  xen/common/domain.c         |   13 ++++
>>  xen/common/domctl.c         |  167 +++++++++++++++++++++++++++++++++++++++++++
>>  xen/common/memory.c         |   62 ++++++++++++++++
>>  xen/include/public/domctl.h |   29 ++++++++
>>  xen/include/public/memory.h |   47 +++++++++++-
>>  xen/include/xen/domain.h    |   11 +++
>>  xen/include/xen/sched.h     |    1 +
>>  7 files changed, 329 insertions(+), 1 deletion(-)
>>
>> diff --git a/xen/common/domain.c b/xen/common/domain.c
>> index cd64aea..895584a 100644
>> --- a/xen/common/domain.c
>> +++ b/xen/common/domain.c
>> @@ -584,6 +584,18 @@ int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d)
>>      return 0;
>>  }
>>
>> +void vnuma_destroy(struct vnuma_info *vnuma)
>> +{
>> +    if ( vnuma )
>> +    {
>> +        xfree(vnuma->vmemrange);
>> +        xfree(vnuma->vcpu_to_vnode);
>> +        xfree(vnuma->vdistance);
>> +        xfree(vnuma->vnode_to_pnode);
>> +        xfree(vnuma);
>> +    }
>> +}
>> +
>>  int domain_kill(struct domain *d)
>>  {
>>      int rc = 0;
>> @@ -602,6 +614,7 @@ int domain_kill(struct domain *d)
>>          evtchn_destroy(d);
>>          gnttab_release_mappings(d);
>>          tmem_destroy(d->tmem_client);
>> +        vnuma_destroy(d->vnuma);
>>          domain_set_outstanding_pages(d, 0);
>>          d->tmem_client = NULL;
>>          /* fallthrough */
>> diff --git a/xen/common/domctl.c b/xen/common/domctl.c
>> index c326aba..7464284 100644
>> --- a/xen/common/domctl.c
>> +++ b/xen/common/domctl.c
>> @@ -297,6 +297,144 @@ int vcpuaffinity_params_invalid(const xen_domctl_vcpuaffinity_t *vcpuaff)
>>              guest_handle_is_null(vcpuaff->cpumap_soft.bitmap));
>>  }
>>
>> +/*
>> + * Allocates memory for vNUMA, **vnuma should be NULL.
>> + * Caller has to make sure that domain has max_pages
>> + * and number of vcpus set for domain.
>> + * Verifies that single allocation does not exceed
>> + * PAGE_SIZE.
>> + */
>> +static int vnuma_alloc(struct vnuma_info **vnuma,
>> +                       unsigned int nr_vnodes,
>> +                       unsigned int nr_vcpus,
>> +                       unsigned int dist_size)
>> +{
>> +    struct vnuma_info *v;
>> +
>> +    if ( vnuma && *vnuma )
>> +        return -EINVAL;
>> +
>> +    v = *vnuma;
>> +    /*
>> +     * check if any of xmallocs exeeds PAGE_SIZE.
>> +     * If yes, consider it as an error for now.
>> +     */
>> +    if ( nr_vnodes > PAGE_SIZE / sizeof(nr_vnodes)        ||
>> +         nr_vcpus > PAGE_SIZE / sizeof(nr_vcpus)          ||
>> +         nr_vnodes > PAGE_SIZE / sizeof(struct vmemrange) ||
>> +         dist_size > PAGE_SIZE / sizeof(dist_size) )
>> +        return -EINVAL;
>> +
>> +    v = xzalloc(struct vnuma_info);
>> +    if ( !v )
>> +        return -ENOMEM;
>> +
>> +    v->vdistance = xmalloc_array(unsigned int, dist_size);
>> +    v->vmemrange = xmalloc_array(vmemrange_t, nr_vnodes);
>> +    v->vcpu_to_vnode = xmalloc_array(unsigned int, nr_vcpus);
>> +    v->vnode_to_pnode = xmalloc_array(unsigned int, nr_vnodes);
>> +
>> +    if ( v->vdistance == NULL || v->vmemrange == NULL ||
>> +         v->vcpu_to_vnode == NULL || v->vnode_to_pnode == NULL )
>> +    {
>> +        vnuma_destroy(v);
>> +        return -ENOMEM;
>> +    }
>> +
>> +    *vnuma = v;
>> +
>> +    return 0;
>> +}
>> +
>> +/*
>> + * Allocate memory and construct one vNUMA node,
>> + * set default parameters, assign all memory and
>> + * vcpus to this node, set distance to 10.
>> + */
>> +static long vnuma_fallback(const struct domain *d,
>> +                           struct vnuma_info **vnuma)
>> +{
>> +    struct vnuma_info *v;
>> +    long ret;
>> +
>> +
>> +    /* Will not destroy vNUMA here, destroy before calling this. */
>> +    if ( vnuma && *vnuma )
>> +        return -EINVAL;
>> +
>> +    v = *vnuma;
>> +    ret = vnuma_alloc(&v, 1, d->max_vcpus, 1);
>> +    if ( ret )
>> +        return ret;
>> +
>> +    v->vmemrange[0].start = 0;
>> +    v->vmemrange[0].end = d->max_pages << PAGE_SHIFT;
>> +    v->vdistance[0] = 10;
>> +    v->vnode_to_pnode[0] = NUMA_NO_NODE;
>> +    memset(v->vcpu_to_vnode, 0, d->max_vcpus);
>> +    v->nr_vnodes = 1;
>> +
>> +    *vnuma = v;
>> +
>> +    return 0;
>> +}
>> +
>> +/*
>> + * construct vNUMA topology form u_vnuma struct and return
>> + * it in dst.
>> + */
>> +long vnuma_init(const struct xen_domctl_vnuma *u_vnuma,
>> +                const struct domain *d,
>> +                struct vnuma_info **dst)
>> +{
>> +    unsigned int dist_size, nr_vnodes = 0;
>> +    long ret;
>> +    struct vnuma_info *v = NULL;
>> +
>> +    ret = -EINVAL;
>> +
>> +    /* If vNUMA topology already set, just exit. */
>> +    if ( !u_vnuma || *dst )
>> +        return ret;
>> +
>> +    nr_vnodes = u_vnuma->nr_vnodes;
>> +
>> +    if ( nr_vnodes == 0 )
>> +        return ret;
>> +
>> +    if ( nr_vnodes > (UINT_MAX / nr_vnodes) )
>> +        return ret;
>> +
>> +    dist_size = nr_vnodes * nr_vnodes;
>> +
>> +    ret = vnuma_alloc(&v, nr_vnodes, d->max_vcpus, dist_size);
>> +    if ( ret )
>> +        return ret;
>> +
>> +    /* On failure, set only one vNUMA node and its success. */
>> +    ret = 0;
>> +
>> +    if ( copy_from_guest(v->vdistance, u_vnuma->vdistance, dist_size) )
>> +        goto vnuma_onenode;
>> +    if ( copy_from_guest(v->vmemrange, u_vnuma->vmemrange, nr_vnodes) )
>> +        goto vnuma_onenode;
>> +    if ( copy_from_guest(v->vcpu_to_vnode, u_vnuma->vcpu_to_vnode,
>> +                         d->max_vcpus) )
>> +        goto vnuma_onenode;
>> +    if ( copy_from_guest(v->vnode_to_pnode, u_vnuma->vnode_to_pnode,
>> +                         nr_vnodes) )
>> +        goto vnuma_onenode;
>> +
>> +    v->nr_vnodes = nr_vnodes;
>> +    *dst = v;
>> +
>> +    return ret;
>> +
>> +vnuma_onenode:
>> +    vnuma_destroy(v);
>> +    return vnuma_fallback(d, dst);
>> +}
>> +
>>  long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>>  {
>>      long ret = 0;
>> @@ -967,6 +1105,35 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>>      }
>>      break;
>>
>> +    case XEN_DOMCTL_setvnumainfo:
>> +    {
>> +        struct vnuma_info *v = NULL;
>> +
>> +        ret = -EFAULT;
>> +        if ( guest_handle_is_null(op->u.vnuma.vdistance)     ||
>> +             guest_handle_is_null(op->u.vnuma.vmemrange)     ||
>> +             guest_handle_is_null(op->u.vnuma.vcpu_to_vnode) ||
>> +             guest_handle_is_null(op->u.vnuma.vnode_to_pnode) )
>> +            return ret;
>> +
>> +        ret = -EINVAL;
>> +
>> +        ret = vnuma_init(&op->u.vnuma, d, &v);
>> +        if ( ret < 0 || v == NULL )
>> +            break;
>> +
>> +        /* overwrite vnuma for domain */
>> +        if ( !d->vnuma )
>
> You want that in within the domain_lock.
>
> Otherwise an caller (on another CPU) could try to read the
> d->vnuma and blow up. Say by using the serial console and
> wanting to read the guest vNUMA topology.
>
>> +            vnuma_destroy(d->vnuma);
>> +
>> +        domain_lock(d);
>
> I would just do
>
>  vnuma_destroy(d->vnuma)
>
> here and remove the 'if' above.
>
>> +        d->vnuma = v;
>> +        domain_unlock(d);
>> +
>> +        ret = 0;
>> +    }
>> +    break;
>> +

Agree and done )

>>      default:
>>          ret = arch_do_domctl(op, d, u_domctl);
>>          break;
>> diff --git a/xen/common/memory.c b/xen/common/memory.c
>> index c2dd31b..925b9fc 100644
>> --- a/xen/common/memory.c
>> +++ b/xen/common/memory.c
>> @@ -969,6 +969,68 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
>>
>>          break;
>>
>> +    case XENMEM_get_vnumainfo:
>> +    {
>> +        struct vnuma_topology_info topology;
>> +        struct domain *d;
>> +        unsigned int dom_vnodes = 0;
>> +
>> +        /*
>> +         * guest passes nr_vnodes and nr_vcpus thus
>> +         * we know how much memory guest has allocated.
>> +         */
>> +        if ( copy_from_guest(&topology, arg, 1) ||
>> +             guest_handle_is_null(topology.vmemrange.h) ||
>> +             guest_handle_is_null(topology.vdistance.h) ||
>> +             guest_handle_is_null(topology.vcpu_to_vnode.h) )
>> +            return -EFAULT;
>> +
>> +        if ( (d = rcu_lock_domain_by_any_id(topology.domid)) == NULL )
>> +            return -ESRCH;
>> +
>> +        rc = -EOPNOTSUPP;
>> +        if ( d->vnuma == NULL )
>> +            goto vnumainfo_out;
>> +
>> +        if ( d->vnuma->nr_vnodes == 0 )
>> +            goto vnumainfo_out;
>> +
>> +        dom_vnodes = d->vnuma->nr_vnodes;
>> +
>> +        /*
>> +         * guest nr_cpus and nr_nodes may differ from domain vnuma config.
>> +         * Check here guest nr_nodes and nr_cpus to make sure we dont overflow.
>> +         */
>> +        rc = -ENOBUFS;
>> +        if ( topology.nr_vnodes < dom_vnodes ||
>> +             topology.nr_vcpus < d->max_vcpus )
>> +            goto vnumainfo_out;
>> +
>> +        rc = -EFAULT;
>> +
>> +        if ( copy_to_guest(topology.vmemrange.h, d->vnuma->vmemrange,
>> +                           dom_vnodes) != 0 )
>> +            goto vnumainfo_out;
>> +
>> +        if ( copy_to_guest(topology.vdistance.h, d->vnuma->vdistance,
>> +                           dom_vnodes * dom_vnodes) != 0 )
>> +            goto vnumainfo_out;
>> +
>> +        if ( copy_to_guest(topology.vcpu_to_vnode.h, d->vnuma->vcpu_to_vnode,
>> +                           d->max_vcpus) != 0 )
>> +            goto vnumainfo_out;
>> +
>> +        topology.nr_vnodes = dom_vnodes;
>> +
>> +        if ( copy_to_guest(arg, &topology, 1) != 0 )
>> +            goto vnumainfo_out;
>> +        rc = 0;
>> +
>> + vnumainfo_out:
>> +        rcu_unlock_domain(d);
>> +        break;
>> +    }
>> +
>>      default:
>>          rc = arch_memory_op(cmd, arg);
>>          break;
>> diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
>> index 5b11bbf..5ee74f4 100644
>> --- a/xen/include/public/domctl.h
>> +++ b/xen/include/public/domctl.h
>> @@ -35,6 +35,7 @@
>>  #include "xen.h"
>>  #include "grant_table.h"
>>  #include "hvm/save.h"
>> +#include "memory.h"
>>
>>  #define XEN_DOMCTL_INTERFACE_VERSION 0x0000000a
>>
>> @@ -934,6 +935,32 @@ struct xen_domctl_vcpu_msrs {
>>  };
>>  typedef struct xen_domctl_vcpu_msrs xen_domctl_vcpu_msrs_t;
>>  DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpu_msrs_t);
>> +
>> +/*
>> + * Use in XEN_DOMCTL_setvnumainfo to set
>> + * vNUMA domain topology.
>> + */
>> +struct xen_domctl_vnuma {
>> +    uint32_t nr_vnodes;
>> +    uint32_t _pad;
>> +    XEN_GUEST_HANDLE_64(uint) vdistance;
>> +    XEN_GUEST_HANDLE_64(uint) vcpu_to_vnode;
>> +
>> +    /*
>> +     * vnodes to physical NUMA nodes mask.
>> +     * This kept on per-domain basis for
>> +     * interested consumers, such as numa aware ballooning.
>> +     */
>> +    XEN_GUEST_HANDLE_64(uint) vnode_to_pnode;
>> +
>> +    /*
>> +     * memory rages for each vNUMA node
>> +     */
>> +    XEN_GUEST_HANDLE_64(vmemrange_t) vmemrange;
>> +};
>> +typedef struct xen_domctl_vnuma xen_domctl_vnuma_t;
>> +DEFINE_XEN_GUEST_HANDLE(xen_domctl_vnuma_t);
>> +
>>  #endif
>>
>>  struct xen_domctl {
>> @@ -1008,6 +1035,7 @@ struct xen_domctl {
>>  #define XEN_DOMCTL_cacheflush                    71
>>  #define XEN_DOMCTL_get_vcpu_msrs                 72
>>  #define XEN_DOMCTL_set_vcpu_msrs                 73
>> +#define XEN_DOMCTL_setvnumainfo                  74
>>  #define XEN_DOMCTL_gdbsx_guestmemio            1000
>>  #define XEN_DOMCTL_gdbsx_pausevcpu             1001
>>  #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
>> @@ -1068,6 +1096,7 @@ struct xen_domctl {
>>          struct xen_domctl_cacheflush        cacheflush;
>>          struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu;
>>          struct xen_domctl_gdbsx_domstatus   gdbsx_domstatus;
>> +        struct xen_domctl_vnuma             vnuma;
>>          uint8_t                             pad[128];
>>      } u;
>>  };
>> diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
>> index 2c57aa0..2c212e1 100644
>> --- a/xen/include/public/memory.h
>> +++ b/xen/include/public/memory.h
>> @@ -521,9 +521,54 @@ DEFINE_XEN_GUEST_HANDLE(xen_mem_sharing_op_t);
>>   * The zero value is appropiate.
>>   */
>>
>> +/* vNUMA node memory range */
>> +struct vmemrange {
>> +    uint64_t start, end;
>> +};
>> +
>> +typedef struct vmemrange vmemrange_t;
>> +DEFINE_XEN_GUEST_HANDLE(vmemrange_t);
>> +
>> +/*
>> + * vNUMA topology specifies vNUMA node number, distance table,
>> + * memory ranges and vcpu mapping provided for guests.
>> + * XENMEM_get_vnumainfo hypercall expects to see from guest
>> + * nr_vnodes and nr_vcpus to indicate available memory. After
>> + * filling guests structures, nr_vnodes and nr_vcpus copied
>> + * back to guest.
>> + */
>> +struct vnuma_topology_info {
>> +    /* IN */
>> +    domid_t domid;
>> +    /* IN/OUT */
>> +    unsigned int nr_vnodes;
>> +    unsigned int nr_vcpus;
>> +    /* OUT */
>> +    union {
>> +        XEN_GUEST_HANDLE(uint) h;
>> +        uint64_t pad;
>> +    } vdistance;
>> +    union {
>> +        XEN_GUEST_HANDLE(uint) h;
>> +        uint64_t pad;
>> +    } vcpu_to_vnode;
>> +    union {
>> +        XEN_GUEST_HANDLE(vmemrange_t) h;
>> +        uint64_t pad;
>> +    } vmemrange;
>> +};
>> +typedef struct vnuma_topology_info vnuma_topology_info_t;
>> +DEFINE_XEN_GUEST_HANDLE(vnuma_topology_info_t);
>> +
>> +/*
>> + * XENMEM_get_vnumainfo used by guest to get
>> + * vNUMA topology from hypervisor.
>> + */
>> +#define XENMEM_get_vnumainfo 26
>> +
>>  #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
>>
>> -/* Next available subop number is 26 */
>> +/* Next available subop number is 27 */
>>
>>  #endif /* __XEN_PUBLIC_MEMORY_H__ */
>>
>> diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
>> index bb1c398..d29a84d 100644
>> --- a/xen/include/xen/domain.h
>> +++ b/xen/include/xen/domain.h
>> @@ -89,4 +89,15 @@ extern unsigned int xen_processor_pmbits;
>>
>>  extern bool_t opt_dom0_vcpus_pin;
>>
>> +/* vnuma topology per domain. */
>> +struct vnuma_info {
>> +    unsigned int nr_vnodes;
>> +    unsigned int *vdistance;
>> +    unsigned int *vcpu_to_vnode;
>> +    unsigned int *vnode_to_pnode;
>> +    struct vmemrange *vmemrange;
>> +};
>> +
>> +void vnuma_destroy(struct vnuma_info *vnuma);
>> +
>>  #endif /* __XEN_DOMAIN_H__ */
>> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
>> index d5bc461..71e4218 100644
>> --- a/xen/include/xen/sched.h
>> +++ b/xen/include/xen/sched.h
>> @@ -447,6 +447,7 @@ struct domain
>>      nodemask_t node_affinity;
>>      unsigned int last_alloc_node;
>>      spinlock_t node_affinity_lock;
>> +    struct vnuma_info *vnuma;
>>  };
>>
>>  struct domain_setup_info
>> --
>> 1.7.10.4
>>

--
Elena

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel
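[Editorial sketches follow; they are not part of the archived message.]

The locking change agreed above is small. A minimal sketch of how the XEN_DOMCTL_setvnumainfo handler reads once the unconditional vnuma_destroy() is pulled inside the domain_lock critical section, as Konrad suggests (illustrative only, not the exact code of the later revision):

    case XEN_DOMCTL_setvnumainfo:
    {
        struct vnuma_info *v = NULL;

        ret = -EFAULT;
        if ( guest_handle_is_null(op->u.vnuma.vdistance)     ||
             guest_handle_is_null(op->u.vnuma.vmemrange)     ||
             guest_handle_is_null(op->u.vnuma.vcpu_to_vnode) ||
             guest_handle_is_null(op->u.vnuma.vnode_to_pnode) )
            return ret;

        ret = vnuma_init(&op->u.vnuma, d, &v);
        if ( ret < 0 || v == NULL )
            break;

        /*
         * Free any previous topology and publish the new one while
         * holding the domain lock, so a concurrent reader of d->vnuma
         * (e.g. a serial-console key handler) never sees a
         * half-torn-down structure.
         */
        domain_lock(d);
        vnuma_destroy(d->vnuma);
        d->vnuma = v;
        domain_unlock(d);

        ret = 0;
    }
    break;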
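The vdistance table that vnuma_init() copies in is a flat nr_vnodes * nr_vnodes array (dist_size in the patch), presumably indexed row-major as vdistance[from * nr_vnodes + to], with the SLIT-style value 10 on the diagonal that vnuma_fallback() also uses. A toolstack filling it for two virtual nodes might do something like the following; the remote distance of 20 is only an illustrative choice:

    unsigned int nr_vnodes = 2;
    unsigned int vdistance[2 * 2];
    unsigned int i, j;

    for ( i = 0; i < nr_vnodes; i++ )
        for ( j = 0; j < nr_vnodes; j++ )
            /* 10 = local access, 20 = remote; any SLIT-like values work. */
            vdistance[i * nr_vnodes + j] = (i == j) ? 10 : 20;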
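On the guest side, the calling convention described by the vnuma_topology_info comment and the XENMEM_get_vnumainfo handler is: the guest sizes the buffers itself, reports those sizes in nr_vnodes/nr_vcpus, and gets -ENOBUFS back if they are too small or -EOPNOTSUPP if no topology was set for the domain. A sketch of such a caller, assuming the usual HYPERVISOR_memory_op wrapper and set_xen_guest_handle() macro from the guest headers (the helper name here is made up):

    /* Sketch: fetch this domain's vNUMA topology from the hypervisor. */
    static int get_my_vnuma(unsigned int nr_vnodes, unsigned int nr_vcpus,
                            unsigned int *dist, unsigned int *cpu_to_node,
                            struct vmemrange *ranges,
                            unsigned int *real_nr_vnodes)
    {
        int rc;
        struct vnuma_topology_info topo = {
            .domid     = DOMID_SELF,
            .nr_vnodes = nr_vnodes,   /* sizes of the buffers we allocated */
            .nr_vcpus  = nr_vcpus,
        };

        set_xen_guest_handle(topo.vdistance.h, dist);          /* nr_vnodes^2 entries */
        set_xen_guest_handle(topo.vcpu_to_vnode.h, cpu_to_node); /* nr_vcpus entries */
        set_xen_guest_handle(topo.vmemrange.h, ranges);        /* nr_vnodes entries */

        rc = HYPERVISOR_memory_op(XENMEM_get_vnumainfo, &topo);
        if ( rc == 0 )
            /* Xen copies the structure back with the real node count. */
            *real_nr_vnodes = topo.nr_vnodes;
        /* -ENOBUFS: buffers too small, retry larger; -EOPNOTSUPP: no vNUMA set. */
        return rc;
    }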