[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [Xen-devel] [RFC][PATCH 1/2] export NUMA topology from xen
* Ryan Harper <ryanh@xxxxxxxxxx> [2007-04-24 10:30]: > * Keir Fraser <keir@xxxxxxxxxxxxx> [2007-04-10 04:13]: > > On 10/4/07 02:09, "Ryan Harper" <ryanh@xxxxxxxxxx> wrote: > > > > > nr_nodes : 4 > > > mem_chunks : node0:0x0000000000000000-0x0000000190000000 > > > node1:0x0000000190000000-0x0000000300000000 > > > node2:0x0000000300000000-0x0000000470000000 > > > node3:0x0000000470000000-0x0000000640000000 > > > node_to_cpu : node0:0-7 > > > node1:8-15 > > > node2:16-23 > > > node3:24-31 > > > > > > I've also reworked the physinfo call to contain an array of > > > cpu_to_node elements rather than node_to_cpu to support machines larger > > > than 64-way. I convert the array back to node_to_cpu for brevity in > > > xm info display. > > > > The same would make sense for memory regions (i.e., have a list of > > memory-regions and include a node identifier for each one, rather than > > mapping node-id to memory-region) as this would allow to have multiple > > memory regions per node quite easily. But actually I'm not convinced that > > allowing dom0 to read out the physical addresses of memory regions is at all > > useful -- why would anyone care which particular physical address ranges > > belong to a particular node? The hypercall to find amount of free memory per > > node seems more useful, and probably sufficient by itself. > > Updated. > > - Dropped mem_chunks (removed that from existing ia64 NUMA physinfo) > - Fixed up ia64 cpu_to_node_map array size (was MAX_NUMNODES, now > NR_CPUS) > - Fixed sockets_per_node calculation (was bogus on Opteron systems) > - Updated all arches physinfo call to use num_online_nodes() and new > sockets_per_node calculation > > Untested on ia64, ppc. 
Refreshed to changeset: 15200:bd3d6b4c52ec -- Ryan Harper Software Engineer; Linux Technology Center IBM Corp., Austin, Tx (512) 838-9253 T/L: 678-9253 ryanh@xxxxxxxxxx diffstat output: tools/libxc/xc_misc.c | 3 tools/libxc/xenctrl.h | 1 tools/python/xen/lowlevel/xc/xc.c | 61 ++++++++++++++++---- tools/python/xen/xend/XendNode.py | 50 ++++++++++++++++ tools/xenmon/xenbaked.c | 3 tools/xenstat/libxenstat/src/xenstat.c | 3 tools/xentrace/xentrace.c | 3 tools/xm-test/tests/info/02_info_compiledata_pos.py | 4 - xen/arch/ia64/xen/dom0_ops.c | 46 +-------------- xen/arch/powerpc/sysctl.c | 6 - xen/arch/x86/sysctl.c | 33 +++++++++- xen/include/public/sysctl.h | 1 12 files changed, 152 insertions(+), 62 deletions(-) Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx> --- This patch modifies the physinfo hcall to export NUMA cpu_to_node topology information. The new physinfo hcall is integrated into libxc and xend (xm info specifically). Included in this patch is a minor tweak to xm-test's xm info testcase. I've also fixed the sockets_per_node calculation. The new fields in xm info are: nr_cpus : 32 nr_nodes : 4 sockets_per_node : 4 cores_per_socket : 1 threads_per_core : 2 ... node_to_cpu : node0:0-7 node1:8-15 node2:16-23 node3:24-31 I've also reworked the physinfo call to contain an array of cpu_to_node elements rather than node_to_cpu to support machines larger than 64-way. I convert the array back to node_to_cpu for brevity in xm info display. 
Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx> diff -r 400a3dca237e tools/libxc/xc_misc.c --- a/tools/libxc/xc_misc.c Mon Apr 09 12:05:26 2007 +0100 +++ b/tools/libxc/xc_misc.c Fri Apr 13 13:04:24 2007 -0500 @@ -59,6 +59,9 @@ int xc_physinfo(int xc_handle, DECLARE_SYSCTL; sysctl.cmd = XEN_SYSCTL_physinfo; + + /* set pointers to caller's so memcpy doesn't clobber them */ + sysctl.u.physinfo.cpu_to_node = put_info->cpu_to_node; if ( (ret = do_sysctl(xc_handle, &sysctl)) != 0 ) return ret; diff -r 400a3dca237e tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h Mon Apr 09 12:05:26 2007 +0100 +++ b/tools/libxc/xenctrl.h Fri Apr 13 13:04:24 2007 -0500 @@ -473,6 +473,7 @@ int xc_send_debug_keys(int xc_handle, ch int xc_send_debug_keys(int xc_handle, char *keys); typedef xen_sysctl_physinfo_t xc_physinfo_t; +typedef uint32_t xc_cpu_to_node_t; int xc_physinfo(int xc_handle, xc_physinfo_t *info); diff -r 400a3dca237e tools/python/xen/lowlevel/xc/xc.c --- a/tools/python/xen/lowlevel/xc/xc.c Mon Apr 09 12:05:26 2007 +0100 +++ b/tools/python/xen/lowlevel/xc/xc.c Fri Apr 13 15:41:39 2007 -0500 @@ -640,14 +640,26 @@ static PyObject *pyxc_pages_to_kib(XcObj } +#define MAX_NR_CPUS 256 static PyObject *pyxc_physinfo(XcObject *self) { xc_physinfo_t info; char cpu_cap[128], *p=cpu_cap, *q=cpu_cap; - int i; + int i,j, nr_cpus; + PyObject *ret_obj, *node_to_cpu_obj; + xc_cpu_to_node_t *map; + + /* make space for cpu_to_node mapping, up to MAX_NR_CPUS cpus */ + map = (xc_cpu_to_node_t *)malloc( sizeof(xc_cpu_to_node_t) * MAX_NR_CPUS); + + set_xen_guest_handle(info.cpu_to_node, map); if ( xc_physinfo(self->xc_handle, &info) != 0 ) return pyxc_error_to_exception(); + + /* calc number of cpus */ + nr_cpus = info.threads_per_core * info.cores_per_socket * + info.sockets_per_node * info.nr_nodes; *q=0; for(i=0;i<sizeof(info.hw_cap)/4;i++) @@ -659,16 +671,43 @@ static PyObject *pyxc_physinfo(XcObject if(q>cpu_cap) *(q-1)=0; - return Py_BuildValue("{s:i,s:i,s:i,s:i,s:l,s:l,s:l,s:i,s:s}", - 
"threads_per_core", info.threads_per_core, - "cores_per_socket", info.cores_per_socket, - "sockets_per_node", info.sockets_per_node, - "nr_nodes", info.nr_nodes, - "total_memory", pages_to_kib(info.total_pages), - "free_memory", pages_to_kib(info.free_pages), - "scrub_memory", pages_to_kib(info.scrub_pages), - "cpu_khz", info.cpu_khz, - "hw_caps", cpu_cap); + ret_obj = Py_BuildValue("{s:i,s:i,s:i,s:l,s:l,s:l,s:i,s:s}", + "threads_per_core", info.threads_per_core, + "cores_per_socket", info.cores_per_socket, + "sockets_per_node", info.sockets_per_node, + "total_memory", pages_to_kib(info.total_pages), + "free_memory", pages_to_kib(info.free_pages), + "scrub_memory", pages_to_kib(info.scrub_pages), + "cpu_khz", info.cpu_khz, + "hw_caps", cpu_cap); + + /* node to cpu mappings */ + node_to_cpu_obj = PyList_New(0); + + /* make a list for each node */ + for ( i=0; i<info.nr_nodes; i++) + { + PyObject *cpus = PyList_New(0); + + /* walk the cpu_to_node array, for each cpu + which maps to node i, add to cpus list */ + for ( j=0; j<nr_cpus; j++) + { + /* this cpu j maps to node i */ + if ( i == (uint32_t)map[j]) + PyList_Append(cpus, PyInt_FromLong(j)); + } + PyList_Append(node_to_cpu_obj, cpus); + } + /* add list of node to cpu mappings and nr_nodes to physinfo dictionary */ + PyDict_SetItemString(ret_obj, "node_to_cpu", node_to_cpu_obj); + PyDict_SetItemString(ret_obj, "nr_nodes", + Py_BuildValue("i", info.nr_nodes)); + + /* free malloc'd memory */ + free(map); + + return ret_obj; } static PyObject *pyxc_xeninfo(XcObject *self) diff -r 400a3dca237e tools/python/xen/xend/XendNode.py --- a/tools/python/xen/xend/XendNode.py Mon Apr 09 12:05:26 2007 +0100 +++ b/tools/python/xen/xend/XendNode.py Fri Apr 13 13:04:24 2007 -0500 @@ -547,6 +547,54 @@ class XendNode: ['version', ver], ['machine', mch]] + def list_to_rangepairs(self,cmap): + cmap.sort() + pairs = [] + x = y = 0 + for i in range(0,len(cmap)): + try: + if ((cmap[y+1] - cmap[i]) > 1): + pairs.append((cmap[x],cmap[y])) + 
x = y = i+1 + else: + y = y + 1 + # if we go off the end, then just add x to y + except IndexError: + pairs.append((cmap[x],cmap[y])) + + return pairs + + def format_pairs(self,pairs): + if not pairs: + return "no cpus" + out = "" + for f,s in pairs: + if (f==s): + out += '%d'%f + else: + out += '%d-%d'%(f,s) + out += ',' + # trim trailing ',' + return out[:-1] + + def list_to_strrange(self,list): + return self.format_pairs(self.list_to_rangepairs(list)) + + def format_node_to_cpu(self, pinfo): + str='' + whitespace='' + try: + node_to_cpu=pinfo['node_to_cpu'] + for i in range(0, pinfo['nr_nodes']): + str+='%snode%d:%s\n' % (whitespace, + i, + self.list_to_strrange(node_to_cpu[i])) + whitespace='%25s' % '' + except: + str='none\n' + return str[:-1]; + + def physinfo(self): info = self.xc.physinfo() @@ -559,6 +607,7 @@ class XendNode: # physinfo is in KiB, need it in MiB info['total_memory'] = info['total_memory'] / 1024 info['free_memory'] = info['free_memory'] / 1024 + info['node_to_cpu'] = self.format_node_to_cpu(info) ITEM_ORDER = ['nr_cpus', 'nr_nodes', @@ -569,6 +618,7 @@ class XendNode: 'hw_caps', 'total_memory', 'free_memory', + 'node_to_cpu' ] return [[k, info[k]] for k in ITEM_ORDER] diff -r 400a3dca237e tools/xenmon/xenbaked.c --- a/tools/xenmon/xenbaked.c Mon Apr 09 12:05:26 2007 +0100 +++ b/tools/xenmon/xenbaked.c Fri Apr 13 13:04:24 2007 -0500 @@ -448,6 +448,9 @@ unsigned int get_num_cpus(void) int xc_handle = xc_interface_open(); int ret; + /* ensure node_to_cpu is NULL */ + memset(&physinfo, 0, sizeof(physinfo)); + ret = xc_physinfo(xc_handle, &physinfo); if ( ret != 0 ) diff -r 400a3dca237e tools/xenstat/libxenstat/src/xenstat.c --- a/tools/xenstat/libxenstat/src/xenstat.c Mon Apr 09 12:05:26 2007 +0100 +++ b/tools/xenstat/libxenstat/src/xenstat.c Fri Apr 13 13:04:24 2007 -0500 @@ -147,6 +147,9 @@ xenstat_node *xenstat_get_node(xenstat_h /* Store the handle in the node for later access */ node->handle = handle; + + /* ensure node_to_cpu is NULL */ + 
memset(&physinfo, 0, sizeof(physinfo)); /* Get information about the physical system */ if (xc_physinfo(handle->xc_handle, &physinfo) < 0) { diff -r 400a3dca237e tools/xentrace/xentrace.c --- a/tools/xentrace/xentrace.c Mon Apr 09 12:05:26 2007 +0100 +++ b/tools/xentrace/xentrace.c Fri Apr 13 13:04:24 2007 -0500 @@ -260,6 +260,9 @@ unsigned int get_num_cpus(void) int xc_handle = xc_interface_open(); int ret; + /* ensure node_to_cpu is NULL */ + memset(&physinfo, 0, sizeof(physinfo)); + ret = xc_physinfo(xc_handle, &physinfo); if ( ret != 0 ) diff -r 400a3dca237e tools/xm-test/tests/info/02_info_compiledata_pos.py --- a/tools/xm-test/tests/info/02_info_compiledata_pos.py Mon Apr 09 12:05:26 2007 +0100 +++ b/tools/xm-test/tests/info/02_info_compiledata_pos.py Fri Apr 13 13:04:24 2007 -0500 @@ -18,9 +18,7 @@ for line in lines: for line in lines: pieces = line.split(" : ", 1) - if len(pieces) < 2: - FAIL("Found invalid line: [%s]" % line) - else: + if len(pieces) > 1: map[pieces[0]] = pieces[1] for field in ["cores_per_socket", "threads_per_core", "cpu_mhz", diff -r 400a3dca237e xen/arch/ia64/xen/dom0_ops.c --- a/xen/arch/ia64/xen/dom0_ops.c Mon Apr 09 12:05:26 2007 +0100 +++ b/xen/arch/ia64/xen/dom0_ops.c Fri Apr 13 13:20:38 2007 -0500 @@ -239,8 +239,7 @@ long arch_do_sysctl(xen_sysctl_t *op, XE { #ifdef IA64_NUMA_PHYSINFO int i; - node_data_t *chunks; - u64 *map, cpu_to_node_map[MAX_NUMNODES]; + uint32_t *map, cpu_to_node_map[NR_CPUS]; #endif xen_sysctl_physinfo_t *pi = &op->u.physinfo; @@ -249,11 +248,9 @@ long arch_do_sysctl(xen_sysctl_t *op, XE cpus_weight(cpu_sibling_map[0]); pi->cores_per_socket = cpus_weight(cpu_core_map[0]) / pi->threads_per_core; - pi->sockets_per_node = - num_online_cpus() / cpus_weight(cpu_core_map[0]); -#ifndef IA64_NUMA_PHYSINFO - pi->nr_nodes = 1; -#endif + pi->nr_nodes = num_online_nodes(); + pi->sockets_per_node = num_online_cpus() / + (pi->nr_nodes * pi->cores_per_socket * pi->threads_per_core); pi->total_pages = total_pages; 
pi->free_pages = avail_domheap_pages(); pi->scrub_pages = avail_scrub_pages(); @@ -263,41 +260,6 @@ long arch_do_sysctl(xen_sysctl_t *op, XE ret = 0; #ifdef IA64_NUMA_PHYSINFO - /* fetch memory_chunk pointer from guest */ - get_xen_guest_handle(chunks, pi->memory_chunks); - - printk("chunks=%p, num_node_memblks=%u\n", chunks, num_node_memblks); - /* if it is set, fill out memory chunk array */ - if (chunks != NULL) { - if (num_node_memblks == 0) { - /* Non-NUMA machine. Put pseudo-values. */ - node_data_t data; - data.node_start_pfn = 0; - data.node_spanned_pages = total_pages; - data.node_id = 0; - /* copy memory chunk structs to guest */ - if (copy_to_guest_offset(pi->memory_chunks, 0, &data, 1)) { - ret = -EFAULT; - break; - } - } else { - for (i = 0; i < num_node_memblks && i < PUBLIC_MAXCHUNKS; i++) { - node_data_t data; - data.node_start_pfn = node_memblk[i].start_paddr >> - PAGE_SHIFT; - data.node_spanned_pages = node_memblk[i].size >> PAGE_SHIFT; - data.node_id = node_memblk[i].nid; - /* copy memory chunk structs to guest */ - if (copy_to_guest_offset(pi->memory_chunks, i, &data, 1)) { - ret = -EFAULT; - break; - } - } - } - } - /* set number of notes */ - pi->nr_nodes = num_online_nodes(); - /* fetch cpu_to_node pointer from guest */ get_xen_guest_handle(map, pi->cpu_to_node); diff -r 400a3dca237e xen/arch/powerpc/sysctl.c --- a/xen/arch/powerpc/sysctl.c Mon Apr 09 12:05:26 2007 +0100 +++ b/xen/arch/powerpc/sysctl.c Fri Apr 13 13:09:31 2007 -0500 @@ -45,10 +45,10 @@ long arch_do_sysctl(struct xen_sysctl *s cpus_weight(cpu_sibling_map[0]); pi->cores_per_socket = cpus_weight(cpu_core_map[0]) / pi->threads_per_core; - pi->sockets_per_node = - num_online_cpus() / cpus_weight(cpu_core_map[0]); + pi->sockets_per_node = num_online_cpus() / + (num_online_nodes() * pi->cores_per_socket * pi->threads_per_core); - pi->nr_nodes = 1; + pi->nr_nodes = num_online_nodes(); pi->total_pages = total_pages; pi->free_pages = avail_domheap_pages(); pi->cpu_khz = cpu_khz; diff 
-r 400a3dca237e xen/arch/x86/sysctl.c --- a/xen/arch/x86/sysctl.c Mon Apr 09 12:05:26 2007 +0100 +++ b/xen/arch/x86/sysctl.c Fri Apr 13 13:11:15 2007 -0500 @@ -23,6 +23,10 @@ #include <asm/hvm/hvm.h> #include <asm/hvm/support.h> #include <asm/processor.h> +#include <asm/numa.h> +#include <xen/nodemask.h> + +#define get_xen_guest_handle(val, hnd) do { val = (hnd).p; } while (0) long arch_do_sysctl( struct xen_sysctl *sysctl, XEN_GUEST_HANDLE(xen_sysctl_t) u_sysctl) @@ -34,16 +38,19 @@ long arch_do_sysctl( case XEN_SYSCTL_physinfo: { + int i; + uint32_t *map, cpu_to_node_map[NR_CPUS]; + xen_sysctl_physinfo_t *pi = &sysctl->u.physinfo; pi->threads_per_core = cpus_weight(cpu_sibling_map[0]); pi->cores_per_socket = cpus_weight(cpu_core_map[0]) / pi->threads_per_core; - pi->sockets_per_node = - num_online_cpus() / cpus_weight(cpu_core_map[0]); + pi->nr_nodes = num_online_nodes(); + pi->sockets_per_node = num_online_cpus() / + (pi->nr_nodes * pi->cores_per_socket * pi->threads_per_core); - pi->nr_nodes = 1; pi->total_pages = total_pages; pi->free_pages = avail_domheap_pages(); pi->scrub_pages = avail_scrub_pages(); @@ -51,6 +58,26 @@ long arch_do_sysctl( memset(pi->hw_cap, 0, sizeof(pi->hw_cap)); memcpy(pi->hw_cap, boot_cpu_data.x86_capability, NCAPINTS*4); ret = 0; + + /* fetch cpu_to_node pointer from guest */ + get_xen_guest_handle(map, sysctl->u.physinfo.cpu_to_node); + + /* if set, fill out cpu_to_node array */ + if ( map != NULL ) + { + /* for each cpu, mark in which node the cpu belongs */ + memset(cpu_to_node_map, 0, sizeof(cpu_to_node_map)); + for ( i = 0; i < num_online_cpus(); i++) + { + cpu_to_node_map[i]=cpu_to_node(i); + if ( copy_to_guest_offset(sysctl->u.physinfo.cpu_to_node, + i, &(cpu_to_node_map[i]), 1) ) { + ret = -EFAULT; + break; + } + } + } + if ( copy_to_guest(u_sysctl, sysctl, 1) ) ret = -EFAULT; } diff -r 400a3dca237e xen/include/public/sysctl.h --- a/xen/include/public/sysctl.h Mon Apr 09 12:05:26 2007 +0100 +++ b/xen/include/public/sysctl.h Fri 
Apr 13 13:04:24 2007 -0500 @@ -85,6 +85,7 @@ struct xen_sysctl_physinfo { uint64_aligned_t free_pages; uint64_aligned_t scrub_pages; uint32_t hw_cap[8]; + XEN_GUEST_HANDLE(uint32_t) cpu_to_node; }; typedef struct xen_sysctl_physinfo xen_sysctl_physinfo_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_physinfo_t); _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |