From 033baca36963923c467adcb3d0473ea1f1e9b440 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Fri, 9 Jun 2017 12:22:24 -0400
Subject: [PATCH 6/7] xen-numa: Diagnostic tool to figure out NUMA issues.

The tool can provide multiple views of a guest.

 - 'mfns' will dump all of the MFNs of a guest, useful for sorting
   and such and double-checking.

 - 'pfns' is an upgraded version of the above. It includes such details
   as what the PFN is within the guest. The list is not sorted.
   The PFNs are decimal, while the MFNs are hex to ease sorting.

 - 'node' digs in the PFNs and MFNs and figures out where they are -
   which PFNs belong to what NODE. This should match the guest view,
   otherwise we have issues.

For example on Dom0 on SuperMicro H8DG6:

sh-4.1# xen-numa node 0
-bash-4.1# /xen-numa node 0
NODE0 0 -> 0x1a8000 (6784 MB)
NODE1 0x1a8000 -> 0x2a8000 (4096 MB)
0.0%..10.0%..20.0%..30.0%..40.0%..50.0%..60.0%..70.0%..80.0%..90.0%..
Max gpfn is 0x40069 (1024 MB)
- NODE0 PFNs (33.173813%):
0x8352->0x8553 (514)
0x28554->0x2b995 (13378)
0x2b997->0x2d10c (6006)
0x2d10d->0x2d5a1 (1173)
0x2d5a3->0x38553 (44977)
0x3dc00->0x3e553 (2388)
0x3f554->0x3fd53 (2048)
0x3ff54->0x3ffd3 (128)
0x3fff4->0x3fffb (8)
0x39c00->0x3dbff (16384)
0x620a->0x620c (3)
0x61fe->0x6200 (3)
0x6215, 0x621b, 0x6221, 0x6249, 0x6231, 0x63a7, 0x635f,
- NODE1 PFNs (66.771660%):
0x0->0x97 (152)
0x40000->0x40068 (105)
0x100->0x61fd (24830)
0x6201->0x6209 (9)
0x620d->0x6214 (8)
0x6216->0x621a (5)
0x621c->0x6220 (5)
0x6222->0x6230 (15)
0x6232->0x6248 (23)
0x624a->0x635e (277)
0x6360->0x63a6 (71)
0x63a8->0x8351 (8106)
0x8353, 0x8554->0x28553 (131072)
0x38554->0x39bff (5804)
0x3e554->0x3f553 (4096)
0x3fd54->0x3ff53 (512)
0x3ffd4->0x3fff3 (32)
0x3fffc->0x3ffff (4)

Signed-off-by: Konrad Rzeszutek Wilk
---
 tools/misc/Makefile   |   5 +
 tools/misc/xen-numa.c | 628 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 633 insertions(+)
 create mode 100644 tools/misc/xen-numa.c

diff --git a/tools/misc/Makefile b/tools/misc/Makefile
index 4cc7296..ea0bd9b 100644
--- a/tools/misc/Makefile
+++ b/tools/misc/Makefile
@@ -17,6 +17,7 @@ TARGETS-y += xen-insmod
 TARGETS-y += xen-rmmod
 TARGETS-y += xen-lsmod
 TARGETS-y += xen-attribute
+TARGETS-y += xen-numa
 TARGETS := $(TARGETS-y)

 SUBDIRS := $(SUBDIRS-y)
@@ -34,6 +35,7 @@ INSTALL_SBIN-y += xen-insmod
 INSTALL_SBIN-y += xen-rmmod
 INSTALL_SBIN-y += xen-lsmod
 INSTALL_SBIN-y += xen-attribute
+INSTALL_SBIN-y += xen-numa
 INSTALL_SBIN := $(INSTALL_SBIN-y)

 INSTALL_PRIVBIN-y := xenpvnetboot
@@ -100,6 +102,9 @@ xen-lsmod xen-rmmod xen-insmod: xen-%: xen-%.o
 xen-attribute: xen-%: xen-%.o
 	$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)

+xen-numa: xen-numa.o
+	$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(LDLIBS_libxenguest) $(APPEND_LDFLAGS)
+
 xen-lowmemd: xen-lowmemd.o
 	$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenctrl) $(LDLIBS_libxenstore) $(APPEND_LDFLAGS)

diff --git a/tools/misc/xen-numa.c b/tools/misc/xen-numa.c
new file mode 100644
index 0000000..a0af262
--- /dev/null
+++ b/tools/misc/xen-numa.c
@@ -0,0 +1,628 @@
+/*
+ * Copyright (c) 2017 Oracle and/or its affiliates. All rights reserved.
+ */
+
+#define _GNU_SOURCE
+
+/*
+ * NOTE(review): the header names were missing from the copy of this patch
+ * that was reviewed.  The list below is reconstructed from the symbols the
+ * file uses and must be confirmed against the tree before merging.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#include <xenctrl.h>
+
+#include <xc_private.h>
+#include <xg_private.h>
+#include <xg_save_restore.h>
+
+#define LOGFILE stdout
+
+/* One reporting mode.  Every callback except iterate() is optional. */
+struct ops {
+    const char *name;               /* Matched against argv[1]. */
+    const char *help;               /* Line printed by show_usage(). */
+    int (*setup)(struct ops *);     /* Once, before the walk. */
+    void (*free)(struct ops *);     /* Drops the data gathered by a walk. */
+    void (*begin)(struct ops *);    /* At the start of every walk. */
+    int (*iterate)(struct ops *, unsigned long pfn, unsigned long mfn);
+    void (*end)(struct ops *);      /* After the walk has stopped. */
+
+    unsigned int arg3;
+    unsigned int arg4;
+
+    unsigned long max_gpfn;
+    xen_pfn_t *live_m2p;
+
+    struct xen_vmemrange *nodes;
+    unsigned int nodes_nr;
+
+    void *priv;
+};
+
+/*
+ * Walk the pages of @domain and hand each (pfn, mfn) pair to ops->iterate().
+ * The walk restarts from PFN 0 whenever the version reported by
+ * xc_get_pfn_list() changes.
+ *
+ * Returns 0 on success, a negative value on failure, or the non-zero value
+ * with which ops->iterate() stopped the walk.
+ */
+static int iterate(xc_interface *xc_handle,
+                   uint32_t domain,
+                   struct ops *ops)
+{
+    int ret;
+    long gpfn_rc;
+    unsigned long hvirt_start;
+    unsigned int pt_levels;
+    uint64_t *buf = NULL;
+    unsigned long max_mfn = 0; /* max mfn of the whole machine */
+    unsigned long m2p_mfn0;
+    unsigned int guest_width;
+    unsigned long i, start_pfn, version, max, old_v, max_gpfn;
+
+    /* DOMID_FIRST_RESERVED is itself a reserved ID, so reject it too. */
+    if ( domain >= DOMID_FIRST_RESERVED )
+        return -1;
+
+    /*
+     * Get max gpfn.  Test the result while it is still signed: stored in an
+     * unsigned long, an error can no longer be told apart from a large PFN.
+     */
+    gpfn_rc = do_memory_op(xc_handle, XENMEM_maximum_gpfn, &domain,
+                           sizeof(domain));
+    if ( gpfn_rc < 0 )
+    {
+        fprintf(stderr, "Failed to get max_gpfn (rc %ld)\n", gpfn_rc);
+        return -EINVAL;
+    }
+    max_gpfn = (unsigned long)gpfn_rc + 1;
+
+    ops->max_gpfn = max_gpfn;
+    if ( ops->begin )
+        (ops->begin)(ops);
+
+    /* Get max mfn */
+    if ( !get_platform_info(xc_handle, domain,
+                            &max_mfn, &hvirt_start,
+                            &pt_levels, &guest_width) )
+    {
+        fprintf(stderr, "Failed to get platform information\n");
+        return -EINVAL;
+    }
+
+    /* The max is GB(1) in pages. */
+    max = 262144;
+
+    ops->live_m2p = xc_map_m2p(xc_handle, max_mfn, PROT_READ, &m2p_mfn0);
+    if ( !ops->live_m2p )
+    {
+        fprintf(stderr, "Failed to map live M2P table\n");
+        return -EINVAL;
+    }
+
+    /* Get guest's pfn list */
+    buf = malloc(sizeof(uint64_t) * max);
+    if ( !buf )
+    {
+        fprintf(stderr, "Failed to alloc pfn buf\n");
+        munmap(ops->live_m2p, M2P_SIZE(max_mfn));
+        return -EINVAL;
+    }
+
+    start_pfn = 0;
+    old_v = version = 0;
+    do {
+        memset(buf, 0xFF, sizeof(uint64_t) * max);
+        ret = xc_get_pfn_list(xc_handle, domain, buf, start_pfn, max, &version);
+        if ( old_v != version )
+        {
+            /* Drop what was gathered so far and start over from PFN 0. */
+            fprintf(stderr, "P2M changed, refetching.\n");
+            start_pfn = 0;
+            old_v = version;
+            if ( ops->free )
+                (ops->free)(ops);
+            if ( ops->begin )
+                (ops->begin)(ops);
+            continue;
+        }
+
+        if ( ret < 0 )
+        {
+            fprintf(stderr,
+                    "Failed to call with start_pfn=0x%lx, max=0x%lx, ret %d\n",
+                    start_pfn, max, ret);
+            break;
+        }
+        if ( !ret )
+            break;
+
+        max = ret; /* Update it for the next iteration. */
+        for ( i = 0; i < max; i++ )
+        {
+            ret = (ops->iterate)(ops, i + start_pfn, buf[i]);
+            if ( ret )
+                break;
+        }
+
+        start_pfn += max;
+        if ( ret )
+            break;
+
+    } while ( start_pfn < max_gpfn );
+
+    free(buf);
+    if ( ops->end )
+        (ops->end)(ops);
+    munmap(ops->live_m2p, M2P_SIZE(max_mfn));
+
+    return ret;
+}
+
+/* ------------------------- */
+/* 'mfns' mode: print the MFN behind every guest page, one per line. */
+static int print_mfns(struct ops *ops, unsigned long pfn, unsigned long mfn)
+{
+    fprintf(stdout, "0x%lx\n", mfn);
+    return 0;
+}
+
+static struct ops print_mfn_op = {
+    .help = " mfns - print all the MFNs of the guest",
+    .name = "mfns",
+    .iterate = print_mfns,
+};
+
+/* ------------------------- */
+/* 'pfns' mode: print the column headers ahead of the walk. */
+static int print_pfn_and_mfns_header(struct ops *ops)
+{
+    fprintf(stdout,"PFN\tMFN\tNODE\n");
+    fprintf(stdout,"--------------------------\n");
+
+    return 0;
+}
+
+/*
+ * 'pfns' mode: print the PFN the M2P table records for @mfn, the MFN itself
+ * and the node whose range contains the MFN (-1 when none does).
+ */
+static int print_pfn_and_mfns(struct ops *ops, unsigned long pfn, unsigned long mfn)
+{
+    unsigned long m2p = ops->live_m2p[mfn];
+    unsigned int i;
+    int nid = -1;
+
+    for ( i = 0; i < ops->nodes_nr; i++ )
+    {
+        if ( mfn >= ops->nodes[i].start && mfn < ops->nodes[i].end )
+        {
+            nid = ops->nodes[i].nid;
+            break;
+        }
+    }
+
+    /* m2p is unsigned, so print it with %lu rather than %ld. */
+    fprintf(stdout, "%lu\t0x%lx\tNODE%d\n", m2p, mfn, nid);
+    return 0;
+}
+
+static struct ops print_pfns_ops = {
+    .help = " pfns - print the MFNs and PFNs of the guest",
+    .name = "pfns",
+    .setup = print_pfn_and_mfns_header,
+    .iterate = print_pfn_and_mfns,
+};
+
+/* ------------------------- */
+
+/* A run of consecutive PFNs: [start, start + len). */
+struct groups {
+    unsigned long start;
+    unsigned int len;
+    struct groups *next;
+};
+
+/* The PFNs found on one node. */
+struct node_data {
+    int nid;
+    unsigned long pfns;             /* Number of PFNs held in @groups. */
+    struct groups *groups;
+};
+
+/* Private state of the 'node' mode. */
+struct node_args {
+    unsigned int stride;            /* PFNs between progress marks. */
+    struct node_data empty;         /* Pages whose MFN is zero. */
+    struct node_data *nodes_data;   /* One entry per ops->nodes[] element. */
+};
+
+/* Allocate the 'node' mode state; returns NULL when out of memory. */
+static struct node_args *create_node(struct ops *ops)
+{
+    struct node_args *args;
+    unsigned int i;
+    struct node_data *n;
+
+    args = malloc(sizeof(struct node_args));
+    if ( !args )
+        return NULL;
+
+    args->stride = 262144; /* Every 1GB. */
+    args->empty.nid = -1;
+    args->empty.groups = NULL;
+    args->empty.pfns = 0;
+
+    n = malloc(sizeof(struct node_data) * ops->nodes_nr);
+    if ( !n )
+    {
+        free(args);
+        fprintf(stderr, "Failed to initialize temp data.\n");
+        return NULL;
+    }
+    args->nodes_data = n;
+
+    for ( i = 0; i < ops->nodes_nr ; i++ )
+    {
+        n[i].nid = ops->nodes[i].nid;
+        n[i].groups = NULL;
+        n[i].pfns = 0;
+    }
+
+    return args;
+}
+
+/* 'node' mode setup: allocate the state and hang it off ops->priv. */
+static int setup_node(struct ops *ops)
+{
+    struct node_args *args = create_node(ops);
+
+    if ( !args )
+        return -1;
+
+    ops->priv = args;
+    return 0;
+}
+
+/* 'node' mode begin: pick the progress stride and list the node ranges. */
+static void begin_node(struct ops *ops)
+{
+    struct node_args *args = ops->priv;
+    unsigned int i;
+
+    args->stride = ops->max_gpfn / 10;
+
+    for ( i = 0; i < ops->nodes_nr ; i++ )
+    {
+        fprintf(stdout, "NODE%d %#lx -> %#lx (%ld MB)\n", ops->nodes[i].nid,
+                ops->nodes[i].start, ops->nodes[i].end,
+                (ops->nodes[i].end - ops->nodes[i].start) >> 8);
+    }
+}
+
+/* Allocate a group that holds just @pfn; returns NULL when out of memory. */
+static struct groups *create(unsigned long pfn)
+{
+    struct groups *g;
+
+    g = malloc(sizeof(*g));
+    if ( !g )
+        return NULL;
+
+    g->next = NULL;
+    g->start = pfn;
+    g->len = 1;
+
+    return g;
+}
+
+/*
+ * Record @pfn against node @n.  A PFN that directly follows an existing
+ * group extends it, a PFN already covered by a group is left alone, and
+ * anything else starts a new group at the tail of the list.
+ *
+ * Returns 0 on success, -1 if @n is NULL, -ENOMEM if allocation fails.
+ */
+static int add_to(struct node_data *n, unsigned long pfn)
+{
+    struct groups *g, *prev;
+
+    if ( !n )
+        return -1;
+
+    for ( prev = NULL, g = n->groups; g; prev = g, g = g->next )
+    {
+#ifdef DEBUG_NODE
+        fprintf(stderr, "%s[%d]: %lu -> %lu (%lu)\n",
+                __func__, n->nid, g->start, g->start + g->len, pfn);
+#endif
+        /* Already recorded; counting it again would inflate the totals. */
+        if ( pfn >= g->start && pfn < (g->start + g->len) )
+            return 0;
+
+        if ( pfn == (g->start + g->len) )
+        {
+            g->len++;
+            n->pfns++;
+
+            return 0;
+        }
+    }
+
+    /*
+     * No group can take the PFN, so it starts one of its own.  The new
+     * group is created with len 1 and is counted exactly once here.
+     */
+    g = create(pfn);
+    if ( !g )
+        return -ENOMEM;
+
+    if ( prev )
+        prev->next = g;
+    else
+        n->groups = g;
+    n->pfns++;
+
+    return 0;
+}
+
+/*
+ * Account one guest page: a page whose MFN is zero goes to the "empty"
+ * bucket, any other page is filed, under the PFN the M2P table records for
+ * its MFN, against the node whose range contains the MFN.
+ */
+static int _node_iterate(struct node_args *args, struct ops *ops,
+                         unsigned long pfn, unsigned long mfn)
+{
+    unsigned int i;
+
+    if ( !args )
+        return -1;
+
+    if ( !args->nodes_data )
+        return -1;
+
+    if ( args->stride && (pfn % args->stride) == 0 )
+    {
+        fprintf(stdout, "%.1f%%..", ((float)pfn / ops->max_gpfn) * 100);
+        fflush(stdout);
+    }
+    if ( !mfn )
+        return add_to(&args->empty, pfn);
+#ifdef DEBUG_NODE
+    if ( pfn > 10 )
+        return -1;
+#endif
+
+    pfn = ops->live_m2p[mfn];
+    for ( i = 0; i < ops->nodes_nr; i++ )
+    {
+        if ( mfn >= ops->nodes[i].start && mfn < ops->nodes[i].end )
+            return add_to(&args->nodes_data[i], pfn);
+    }
+
+    fprintf(stderr, "PFN 0x%lx, MFN 0x%lx is not within any NODE?!\n", pfn, mfn);
+    return -1;
+}
+
+/* 'node' mode iterate callback. */
+static int node_iterate(struct ops *ops,
+                        unsigned long pfn, unsigned long mfn)
+{
+    return _node_iterate(ops->priv, ops, pfn, mfn);
+}
+
+/* Print the groups of @n and its share of @max_gpfn as a percentage. */
+static void print_groups(struct node_data *n, unsigned long max_gpfn)
+{
+    struct groups *g;
+    float p = 0.0;
+
+    if ( !n->groups )
+    {
+        if ( n->nid >= 0 )
+            fprintf(stdout, "- NODE%d not used.\n", n->nid);
+        return;
+    }
+    if ( n->pfns )
+    {
+        p = (float)n->pfns / (float)max_gpfn;
+        p *= 100;
+    }
+    if ( n->nid >= 0 )
+        fprintf(stdout, "- NODE%d PFNs (%lf%%):\n", n->nid, p);
+    else
+        fprintf(stdout, "PFNs not in any node (%lf%%):\n", p);
+
+    for ( g = n->groups; g; g = g->next )
+    {
+        if ( g->len == 1 )
+            fprintf(stdout, "0x%lx, ", g->start);
+        else
+            fprintf(stdout, "0x%lx->0x%lx (%u)\n",
+                    g->start, g->start + g->len - 1, g->len);
+    }
+    fprintf(stdout, "\n");
+}
+
+/* Free every group of @n and reset its PFN count. */
+static void free_groups(struct node_data *n)
+{
+    struct groups *g, *next;
+
+    /* Fetch the successor first: the node is gone once it is freed. */
+    for ( g = n->groups; g; g = next )
+    {
+        next = g->next;
+        free(g);
+    }
+
+    n->groups = NULL;
+    n->pfns = 0;
+}
+
+/*
+ * 'node' mode free callback: drop the data gathered so far.  The state
+ * itself stays allocated so that a restarted walk can reuse it.
+ */
+static void node_free(struct ops *ops)
+{
+    struct node_args *args = ops->priv;
+    unsigned int i;
+
+    if ( !args )
+        return;
+
+    for ( i = 0; i < ops->nodes_nr; i++ )
+        free_groups(&args->nodes_data[i]);
+
+    free_groups(&args->empty);
+}
+
+/* 'node' mode end callback: print the summary and release the state. */
+static void node_end(struct ops *ops)
+{
+    struct node_args *args = ops->priv;
+    unsigned int i;
+
+    fprintf(stdout, "\nMax gpfn is 0x%lx (%ld MB)\n",
+            ops->max_gpfn, ops->max_gpfn >> 8);
+
+    if ( !args )
+    {
+        fprintf(stderr, "We lost our collected data!\n");
+        return;
+    }
+    for ( i = 0; i < ops->nodes_nr; i++ )
+        print_groups(&args->nodes_data[i], ops->max_gpfn);
+
+    print_groups(&args->empty, ops->max_gpfn);
+
+    node_free(ops);
+    free(args->nodes_data);
+    free(args);
+    ops->priv = NULL;
+}
+
+static struct ops node_ops = {
+    .help = " node - summary of which PFNs are in which NODE.",
+    .name = "node",
+    .begin = begin_node,
+    .setup = setup_node,
+    .iterate = node_iterate,
+    .end = node_end,
+    .free = node_free,
+};
+
+/*
+ * Modes selectable from the command line.  The original list also named
+ * print_pgm_ops, which is not defined anywhere in this file.
+ */
+static struct ops *callback_ops[] = {
+    &print_pfns_ops,
+    &print_mfn_op,
+    &node_ops,
+};
+
+#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
+
+/*
+ * Fetch the NUMA node ranges and run mode @mode against domain @domid.
+ * @mode must be a valid index into callback_ops[].
+ */
+static int print_numa(xc_interface *xch, unsigned int mode, unsigned int domid,
+                      unsigned int arg3, unsigned int arg4)
+{
+    struct xen_vmemrange *info;
+    int rc = 0;
+    struct ops *ops;
+
+    rc = xc_list_numa(xch, &info);
+    if ( rc < 0 )
+    {
+        fprintf(stderr, "Could not get the list of NUMA nodes: %s\n",
+                strerror(errno));
+        return rc;
+    }
+
+    if ( !info )
+    {
+        printf("There is no NUMA?\n");
+        return rc;
+    }
+
+    ops = callback_ops[mode];
+    ops->nodes_nr = rc;
+    ops->nodes = info;
+    ops->arg3 = arg3;
+    ops->arg4 = arg4;
+
+    rc = 0;
+    if ( ops->setup )
+        rc = (ops->setup)(ops);
+
+    if ( !rc )
+        rc = iterate(xch, domid, ops);
+
+    if ( ops->free )
+        (ops->free)(ops);
+
+    free(info);
+
+    return rc;
+}
+
+/* Print the command-line synopsis and the help line of every mode. */
+static void show_usage(const char *const progname)
+{
+    unsigned int i;
+    fprintf(stderr, "%s <mode> <domid> [optional]\n", progname);
+    for ( i = 0; i < ARRAY_SIZE(callback_ops); i++ )
+        fprintf(stderr, "%s\n", callback_ops[i]->help);
+}
+
+/* Entry point: argv[1] selects the mode, argv[2] is the domain ID. */
+int main(int argc, char **argv)
+{
+    xc_interface *xch = NULL;
+    unsigned int i;
+    int rc;
+
+    if ( argc < 3 )
+    {
+        show_usage(argv[0]);
+        return -EINVAL;
+    }
+
+    for ( i = 0; i < ARRAY_SIZE(callback_ops); i++ )
+    {
+        if (!strncmp(callback_ops[i]->name, argv[1], strlen(argv[1])))
+            break;
+    }
+
+    if ( i == ARRAY_SIZE(callback_ops) )
+        return -EINVAL;
+
+    xch = xc_interface_open(0, 0, 0);
+    if ( !xch )
+    {
+        fprintf(stderr, "Could not open Xen handler.\n");
+        return -ENXIO;
+    }
+
+    rc = print_numa(xch, i, atoi(argv[2]),
+                    argc > 3 ? atoi(argv[3]) : 0,
+                    argc > 4 ? atoi(argv[4]) : 0);
+
+    /* The handle is no longer needed; do not leak it. */
+    xc_interface_close(xch);
+
+    return rc;
+}
-- 
2.9.4