[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [Xen-devel] [RFC] pv guest numa [RE: Host Numa informtion in dom0]
Run this patch through scripts/checkpatch.pl On Sat, Feb 13, 2010 at 01:25:22AM -0500, Dulloor wrote: > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c > index 86bef0f..7a24070 100644 > --- a/arch/x86/kernel/setup.c > +++ b/arch/x86/kernel/setup.c > @@ -940,7 +940,8 @@ void __init setup_arch(char **cmdline_p) > /* > * Parse SRAT to discover nodes. > */ > - acpi_numa_init(); > + if (acpi_numa > 0) > + acpi_numa_init(); Why? Why can't we just try acpi_numa_init()? > #endif > > initmem_init(0, max_pfn); > diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c > index 459913b..14fa654 100644 > --- a/arch/x86/mm/numa_64.c > +++ b/arch/x86/mm/numa_64.c > @@ -11,7 +11,9 @@ > #include <linux/ctype.h> > #include <linux/module.h> > #include <linux/nodemask.h> > +#include <linux/cpumask.h> > #include <linux/sched.h> > +#include <xen/interface/xen.h> > > #include <asm/e820.h> > #include <asm/proto.h> > @@ -19,6 +21,7 @@ > #include <asm/numa.h> > #include <asm/acpi.h> > #include <asm/k8.h> > +#include <asm/xen/hypervisor.h> If one does not set CONFIG_XEN does this compile? > > struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; > EXPORT_SYMBOL(node_data); > @@ -428,7 +431,6 @@ static int __init numa_emulation(unsigned long start_pfn, > unsigned long last_pfn > */ > if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { > long n = simple_strtol(cmdline, NULL, 0); > - No need for this. > num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); > if (num_nodes < 0) > return num_nodes; > @@ -522,16 +524,162 @@ out: > numa_init_array(); > return 0; > } > +struct xen_domain_numa_layout pv_numa_layout; > + > +void dump_numa_layout(struct xen_domain_numa_layout *layout) > +{ > + unsigned int i, j; > + char vcpumask[128]; > + printk("NUMA-LAYOUT(Dom0) : vcpus(%u), vnodes(%u)\n", > + layout->max_vcpus, layout->max_vnodes); Redo the printk's. 
They need KERN_DEBUG > + for (i = 0; i < layout->max_vnodes; i++) > + { > + struct xen_vnode_data *vnode_data = &layout->vnode_data[i]; > + cpumask_scnprintf(vcpumask, sizeof(vcpumask), > + ((cpumask_t *)&vnode_data->vcpu_mask)); > + printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n", > + vnode_data->vnode_id, vnode_data->mnode_id, > + (unsigned long)vnode_data->nr_pages, vcpumask); This one too. > + } > + > + printk("vnode distances :\n"); and > + for (i = 0; i < layout->max_vnodes; i++) > + printk("\tvnode[%u]", i); > + for (i = 0; i < layout->max_vnodes; i++) > + { > + printk("\nvnode[%u]", i); this > + for (j = 0; j < layout->max_vnodes; j++) > + printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + j]); > + printk("\n"); one too. > + } > + return; > +} > + > +static void __init xen_init_slit_table(struct xen_domain_numa_layout *layout) > +{ > + /* Construct a slit table (using layout->vnode_distance). > + * Copy it to acpi_slit. */ > + return; > +} > +/* Distribute the vcpus over the vnodes according to their affinity */ > +static void __init xen_init_numa_array(struct xen_domain_numa_layout *layout) > +{ > + int vcpu, vnode; > + > + printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n"); pr_debug instead? 
> + > + for (vnode = 0; vnode < layout->max_vnodes; vnode++) > + { > + struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode]; > + cpumask_t vcpu_mask = *((cpumask_t *)&vnode_data->vcpu_mask); > + > + for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++) > + { > + if (cpu_isset(vcpu, vcpu_mask)) > + { > + if (early_cpu_to_node(vcpu) != NUMA_NO_NODE) > + { > + printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n", > + vcpu, early_cpu_to_node(vcpu)); > + continue; > + } > + printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode); > + numa_set_node(vcpu, vnode); > + } > + } > + } > + return; > +} > + > +static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout, > + unsigned long start_pfn, unsigned long last_pfn) > +{ > + int num_vnodes, i; > + u64 node_start_addr, node_end_addr, max_addr; > + > + printk(KERN_INFO "xen_numa_emulation : max_vnodes(%d), max_vcpus(%d)", > + layout->max_vnodes, > layout->max_vcpus); > + dump_numa_layout(layout); > + memset(&nodes, 0, sizeof(nodes)); > + > + num_vnodes = layout->max_vnodes; > + BUG_ON(num_vnodes > MAX_NUMNODES); Hmm.. Is that really necessary? What if we just do WARN("some lengthy explanation"), and bail out and not initialize these structures? 
> + > + max_addr = last_pfn << PAGE_SHIFT; > + > + node_start_addr = start_pfn << PAGE_SHIFT; > + for (i = 0; i < num_vnodes; i++) > + { > + struct xen_vnode_data *vnode_data = &layout->vnode_data[i]; > + u64 node_size = vnode_data->nr_pages << PAGE_SHIFT; > + > + node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */ > + > + if (i == (num_vnodes-1)) > + node_end_addr = max_addr; > + else > + { > + node_end_addr = node_start_addr + node_size; > + while ((node_end_addr - node_start_addr - > + e820_hole_size(node_start_addr, node_end_addr)) < node_size) > + { > + node_end_addr += FAKE_NODE_MIN_SIZE; > + if (node_end_addr > max_addr) { > + node_end_addr = max_addr; > + break; > + } > + } > + } > + /* node_start_addr updated inside the function */ > + if (setup_node_range(i, nodes, &node_start_addr, > + (node_end_addr-node_start_addr), max_addr+1)) > + BUG(); > + } > + > + printk(KERN_INFO "XEN domain numa emulation - setup nodes\n"); Is this necessary? Can it be debug? > + > + memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL); > + if (memnode_shift < 0) { > + printk(KERN_ERR "No NUMA hash function found.\n"); > + BUG(); Wow. BUG? What about just bailing out and unsetting the xen_pv_emulation flag? > + } > + /* XXX: Shouldn't be needed because we disabled acpi_numa very early ! */ > + /* > + * We need to vacate all active ranges that may have been registered by > + * SRAT and set acpi_numa to -1 so that srat_disabled() always returns > + * true. NUMA emulation has succeeded so we will not scan ACPI nodes. 
> + */ > + remove_all_active_ranges(); > + > + BUG_ON(acpi_numa >= 0); > + for_each_node_mask(i, node_possible_map) { > + e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, > + nodes[i].end >> PAGE_SHIFT); > + setup_node_bootmem(i, nodes[i].start, nodes[i].end); > + } > + xen_init_slit_table(layout); > + xen_init_numa_array(layout); > + return 0; > +} > #endif /* CONFIG_NUMA_EMU */ > > void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) > { > int i; > + struct xen_domain_numa_layout *numa_layout = &pv_numa_layout; > + > + int xen_pv_numa_enabled = numa_layout->max_vnodes; > > nodes_clear(node_possible_map); > nodes_clear(node_online_map); > > #ifdef CONFIG_NUMA_EMU > + if (xen_pv_domain() && xen_pv_numa_enabled) > + { > + if (!xen_numa_emulation(numa_layout, start_pfn, last_pfn)) > + return; > + } > + > if (cmdline && !numa_emulation(start_pfn, last_pfn)) > return; > nodes_clear(node_possible_map); > diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c > index ecb9b0d..b020555 100644 > --- a/arch/x86/xen/enlighten.c > +++ b/arch/x86/xen/enlighten.c > @@ -83,6 +83,8 @@ void *xen_initial_gdt; > */ > struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; > > +struct xen_domain_numa_layout *HYPERVISOR_domain_numa_layout; > + > /* > * Flag to determine whether vcpu info placement is available on all > * VCPUs. 
We assume it is to start with, and then set it to zero on > @@ -1089,6 +1091,7 @@ static void __init xen_setup_stackprotector(void) > pv_cpu_ops.load_gdt = xen_load_gdt; > } > > +extern struct xen_domain_numa_layout pv_numa_layout; > /* First C function to be called on Xen boot */ > asmlinkage void __init xen_start_kernel(void) > { > @@ -1230,6 +1233,12 @@ asmlinkage void __init xen_start_kernel(void) > xen_start_info->console.domU.evtchn = 0; > } > > + { > + struct xen_domain_numa_layout *layout = > + (void *)((char *)xen_start_info + > + xen_start_info->numa_layout_info.info_off); > + memcpy(&pv_numa_layout, layout, sizeof(*layout)); > + } Shouldn't there be a check to see if the Xen actually exports this structure? Otherwise you might copy garbage in and set pv_numa_layout values to random values. > xen_raw_console_write("about to get started...\n"); > > /* Start the world */ > diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c > index 612f2c9..cb944a2 100644 > --- a/arch/x86/xen/setup.c > +++ b/arch/x86/xen/setup.c > @@ -281,6 +281,9 @@ void __init xen_arch_setup(void) > printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); > disable_acpi(); > } > + > + acpi_numa = -1; > + numa_off = 1; > #endif > > /* > diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h > index 812ffd5..d4588fa 100644 > --- a/include/xen/interface/xen.h > +++ b/include/xen/interface/xen.h > @@ -398,6 +398,53 @@ struct shared_info { > > }; > > +#define XEN_NR_CPUS 64 No way to share this with some other variable? Like NR_CPUS? > +#if defined(__i386__) > +#define XEN_BITS_PER_LONG 32 > +#define XEN_BYTES_PER_LONG 4 There gotta be some of these defined in other headers. 
> +#define XEN_LONG_BYTEORDER 2 > +#elif defined(__x86_64__) > +#define XEN_BITS_PER_LONG 64 > +#define XEN_BYTES_PER_LONG 8 > +#define XEN_LONG_BYTEORDER 3 > +#endif > + > +/* same as cpumask_t - in xen and even Linux (for now) */ > +#define XEN_BITS_TO_LONGS(bits) \ > + (((bits)+XEN_BITS_PER_LONG-1)/XEN_BITS_PER_LONG) > +#define XEN_DECLARE_BITMAP(name,bits) \ > + unsigned long name[XEN_BITS_TO_LONGS(bits)] I am pretty sure there are macros for this already. > +struct xen_cpumask{ XEN_DECLARE_BITMAP(bits, XEN_NR_CPUS); }; > +#ifndef __XEN__ Is that necessary? typedefs are frowned upon in kernel. > +typedef struct xen_cpumask xen_cpumask_t; > +#endif > + > +#define XEN_MAX_VNODES 8 > +struct xen_vnode_data { > + uint32_t vnode_id; > + uint32_t mnode_id; > + uint64_t nr_pages; > + /* XXX: Can we use this in xen<->domain interfaces ? */ > + struct xen_cpumask vcpu_mask; /* vnode_to_vcpumask */ > +}; > +#ifndef __XEN__ > +typedef struct xen_vnode_data xen_vnode_data_t; Ditto. > +#endif > + > +/* NUMA layout for the domain at the time of startup. > + * Structure has to fit within a page. */ > +struct xen_domain_numa_layout { > + uint32_t max_vcpus; > + uint32_t max_vnodes; > + > + /* Only (max_vnodes*max_vnodes) entries are filled */ > + uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES]; > + struct xen_vnode_data vnode_data[XEN_MAX_VNODES]; > +}; > +#ifndef __XEN__ > +typedef struct xen_domain_numa_layout xen_domain_numa_layout_t; Ditto. > +#endif > + > /* > * Start-of-day memory layout for the initial domain (DOM0): > * 1. The domain is started within contiguous virtual-memory region. > @@ -449,6 +496,13 @@ struct start_info { > unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */ > unsigned long mod_len; /* Size (bytes) of pre-loaded module. */ > int8_t cmd_line[MAX_GUEST_CMDLINE]; > + /* The pfn range here covers both page table and p->m table frames. */ > + unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table. 
*/ Why is this added in this patch? That doesn't look to be used by your patch. > + unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table. */ Ditto. > + struct { > + uint32_t info_off; /* Offset of console_info struct. * > + uint32_t info_size; /* Size of console_info struct from start.*/ Wrong comment. > + } numa_layout_info; > }; > > struct dom0_vga_console_info { _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |