[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [Xen-devel] [RFC] pv guest numa [RE: Host Numa information in dom0]
Konrad, Thanks for the comments. I will make the changes and send over the patch. Any comments on the general approach are welcome too. -dulloor On Tue, Feb 16, 2010 at 12:49 PM, Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx> wrote: > Run this patch through scripts/checkpatch.pl > > On Sat, Feb 13, 2010 at 01:25:22AM -0500, Dulloor wrote: >> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c >> index 86bef0f..7a24070 100644 >> --- a/arch/x86/kernel/setup.c >> +++ b/arch/x86/kernel/setup.c >> @@ -940,7 +940,8 @@ void __init setup_arch(char **cmdline_p) >> /* >> * Parse SRAT to discover nodes. >> */ >> - acpi_numa_init(); >> + if (acpi_numa > 0) >> + acpi_numa_init(); > > Why? Why can't we just try acpi_numa_init()? > >> #endif >> >> initmem_init(0, max_pfn); >> diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c >> index 459913b..14fa654 100644 >> --- a/arch/x86/mm/numa_64.c >> +++ b/arch/x86/mm/numa_64.c >> @@ -11,7 +11,9 @@ >> #include <linux/ctype.h> >> #include <linux/module.h> >> #include <linux/nodemask.h> >> +#include <linux/cpumask.h> >> #include <linux/sched.h> >> +#include <xen/interface/xen.h> >> >> #include <asm/e820.h> >> #include <asm/proto.h> >> @@ -19,6 +21,7 @@ >> #include <asm/numa.h> >> #include <asm/acpi.h> >> #include <asm/k8.h> >> +#include <asm/xen/hypervisor.h> > > If one does not set CONFIG_XEN does this compile? >> >> struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; >> EXPORT_SYMBOL(node_data); >> @@ -428,7 +431,6 @@ static int __init numa_emulation(unsigned long >> start_pfn, unsigned long last_pfn >> */ >> if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { >> long n = simple_strtol(cmdline, NULL, 0); >> - > > No need for this. 
>> num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); >> if (num_nodes < 0) >> return num_nodes; >> @@ -522,16 +524,162 @@ out: >> numa_init_array(); >> return 0; >> } >> +struct xen_domain_numa_layout pv_numa_layout; >> + >> +void dump_numa_layout(struct xen_domain_numa_layout *layout) >> +{ >> + unsigned int i, j; >> + char vcpumask[128]; >> + printk("NUMA-LAYOUT(Dom0) : vcpus(%u), vnodes(%u)\n", >> + layout->max_vcpus, layout->max_vnodes); > > Redo the printk's. They need KERN_DEBUG >> + for (i = 0; i < layout->max_vnodes; i++) >> + { >> + struct xen_vnode_data *vnode_data = &layout->vnode_data[i]; >> + cpumask_scnprintf(vcpumask, sizeof(vcpumask), >> + ((cpumask_t *)&vnode_data->vcpu_mask)); >> + printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n", >> + vnode_data->vnode_id, vnode_data->mnode_id, >> + (unsigned long)vnode_data->nr_pages, vcpumask); > > This one too. >> + } >> + >> + printk("vnode distances :\n"); > > and >> + for (i = 0; i < layout->max_vnodes; i++) >> + printk("\tvnode[%u]", i); >> + for (i = 0; i < layout->max_vnodes; i++) >> + { >> + printk("\nvnode[%u]", i); > > this >> + for (j = 0; j < layout->max_vnodes; j++) >> + printk("\t%u", layout->vnode_distance[i*layout->max_vnodes + >> j]); >> + printk("\n"); > one to. >> + } >> + return; >> +} >> + >> +static void __init xen_init_slit_table(struct xen_domain_numa_layout >> *layout) >> +{ >> + /* Construct a slit table (using layout->vnode_distance). >> + * Copy it to acpi_slit. */ >> + return; >> +} >> +/* Distribute the vcpus over the vnodes according to their affinity */ >> +static void __init xen_init_numa_array(struct xen_domain_numa_layout >> *layout) >> +{ >> + int vcpu, vnode; >> + >> + printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n"); > > pr_debug instead? 
>> + >> + for (vnode = 0; vnode < layout->max_vnodes; vnode++) >> + { >> + struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode]; >> + cpumask_t vcpu_mask = *((cpumask_t *)&vnode_data->vcpu_mask); >> + >> + for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++) >> + { >> + if (cpu_isset(vcpu, vcpu_mask)) >> + { >> + if (early_cpu_to_node(vcpu) != NUMA_NO_NODE) >> + { >> + printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n", >> + vcpu, early_cpu_to_node(vcpu)); >> + continue; >> + } >> + printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode); >> + numa_set_node(vcpu, vnode); >> + } >> + } >> + } >> + return; >> +} >> + >> +static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout, >> + unsigned long start_pfn, unsigned long last_pfn) >> +{ >> + int num_vnodes, i; >> + u64 node_start_addr, node_end_addr, max_addr; >> + >> + printk(KERN_INFO "xen_numa_emulation : max_vnodes(%d), max_vcpus(%d)", >> + layout->max_vnodes, >> layout->max_vcpus); >> + dump_numa_layout(layout); >> + memset(&nodes, 0, sizeof(nodes)); >> + >> + num_vnodes = layout->max_vnodes; >> + BUG_ON(num_vnodes > MAX_NUMNODES); > > Hmm.. Is that really neccessary? What if we just do WARN("some lengthy > explanation"), > and bail out and not initialize these structures? 
>> + >> + max_addr = last_pfn << PAGE_SHIFT; >> + >> + node_start_addr = start_pfn << PAGE_SHIFT; >> + for (i = 0; i < num_vnodes; i++) >> + { >> + struct xen_vnode_data *vnode_data = &layout->vnode_data[i]; >> + u64 node_size = vnode_data->nr_pages << PAGE_SHIFT; >> + >> + node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */ >> + >> + if (i == (num_vnodes-1)) >> + node_end_addr = max_addr; >> + else >> + { >> + node_end_addr = node_start_addr + node_size; >> + while ((node_end_addr - node_start_addr - >> + e820_hole_size(node_start_addr, node_end_addr)) < node_size) >> + { >> + node_end_addr += FAKE_NODE_MIN_SIZE; >> + if (node_end_addr > max_addr) { >> + node_end_addr = max_addr; >> + break; >> + } >> + } >> + } >> + /* node_start_addr updated inside the function */ >> + if (setup_node_range(i, nodes, &node_start_addr, >> + (node_end_addr-node_start_addr), max_addr+1)) >> + BUG(); >> + } >> + >> + printk(KERN_INFO "XEN domain numa emulation - setup nodes\n"); > > Is this neccessary? Can be it be debug? >> + >> + memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL); >> + if (memnode_shift < 0) { >> + printk(KERN_ERR "No NUMA hash function found.\n"); >> + BUG(); > > Wow. BUG? What about just bailing out and unset xen_pv_emulation flag? > >> + } >> + /* XXX: Shouldn't be needed because we disabled acpi_numa very early ! >> */ >> + /* >> + * We need to vacate all active ranges that may have been registered by >> + * SRAT and set acpi_numa to -1 so that srat_disabled() always returns >> + * true. NUMA emulation has succeeded so we will not scan ACPI nodes. 
>> + */ >> + remove_all_active_ranges(); >> + >> + BUG_ON(acpi_numa >= 0); >> + for_each_node_mask(i, node_possible_map) { >> + e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, >> + nodes[i].end >> PAGE_SHIFT); >> + setup_node_bootmem(i, nodes[i].start, nodes[i].end); >> + } >> + xen_init_slit_table(layout); >> + xen_init_numa_array(layout); >> + return 0; >> +} >> #endif /* CONFIG_NUMA_EMU */ >> >> void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) >> { >> int i; >> + struct xen_domain_numa_layout *numa_layout = &pv_numa_layout; >> + >> + int xen_pv_numa_enabled = numa_layout->max_vnodes; >> >> nodes_clear(node_possible_map); >> nodes_clear(node_online_map); >> >> #ifdef CONFIG_NUMA_EMU >> + if (xen_pv_domain() && xen_pv_numa_enabled) >> + { >> + if (!xen_numa_emulation(numa_layout, start_pfn, last_pfn)) >> + return; >> + } >> + >> if (cmdline && !numa_emulation(start_pfn, last_pfn)) >> return; >> nodes_clear(node_possible_map); >> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c >> index ecb9b0d..b020555 100644 >> --- a/arch/x86/xen/enlighten.c >> +++ b/arch/x86/xen/enlighten.c >> @@ -83,6 +83,8 @@ void *xen_initial_gdt; >> */ >> struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; >> >> +struct xen_domain_numa_layout *HYPERVISOR_domain_numa_layout; >> + >> /* >> * Flag to determine whether vcpu info placement is available on all >> * VCPUs. 
We assume it is to start with, and then set it to zero on >> @@ -1089,6 +1091,7 @@ static void __init xen_setup_stackprotector(void) >> pv_cpu_ops.load_gdt = xen_load_gdt; >> } >> >> +extern struct xen_domain_numa_layout pv_numa_layout; >> /* First C function to be called on Xen boot */ >> asmlinkage void __init xen_start_kernel(void) >> { >> @@ -1230,6 +1233,12 @@ asmlinkage void __init xen_start_kernel(void) >> xen_start_info->console.domU.evtchn = 0; >> } >> >> + { >> + struct xen_domain_numa_layout *layout = >> + (void *)((char *)xen_start_info + >> + xen_start_info->numa_layout_info.info_off); >> + memcpy(&pv_numa_layout, layout, sizeof(*layout)); >> + } > > Shouldn't there be a check to see if the Xen actually exports > this structure? Otherwise you might copy garbage in and set > pv_numa_layout values to random values. > >> xen_raw_console_write("about to get started...\n"); >> >> /* Start the world */ >> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c >> index 612f2c9..cb944a2 100644 >> --- a/arch/x86/xen/setup.c >> +++ b/arch/x86/xen/setup.c >> @@ -281,6 +281,9 @@ void __init xen_arch_setup(void) >> printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); >> disable_acpi(); >> } >> + >> + acpi_numa = -1; >> + numa_off = 1; >> #endif >> >> /* >> diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h >> index 812ffd5..d4588fa 100644 >> --- a/include/xen/interface/xen.h >> +++ b/include/xen/interface/xen.h >> @@ -398,6 +398,53 @@ struct shared_info { >> >> }; >> >> +#define XEN_NR_CPUS 64 > No way to share this with some other variable? Like NR_CPUS? > >> +#if defined(__i386__) >> +#define XEN_BITS_PER_LONG 32 >> +#define XEN_BYTES_PER_LONG 4 > > There gotta be some of these defined in other headers. 
>> +#define XEN_LONG_BYTEORDER 2 >> +#elif defined(__x86_64__) >> +#define XEN_BITS_PER_LONG 64 >> +#define XEN_BYTES_PER_LONG 8 >> +#define XEN_LONG_BYTEORDER 3 >> +#endif >> + >> +/* same as cpumask_t - in xen and even Linux (for now) */ >> +#define XEN_BITS_TO_LONGS(bits) \ >> + (((bits)+XEN_BITS_PER_LONG-1)/XEN_BITS_PER_LONG) >> +#define XEN_DECLARE_BITMAP(name,bits) \ >> + unsigned long name[XEN_BITS_TO_LONGS(bits)] > > I am pretty sure there are macros for this laready. >> +struct xen_cpumask{ XEN_DECLARE_BITMAP(bits, XEN_NR_CPUS); }; >> +#ifndef __XEN__ > > Is that neccessary? typedefs are frowned upon in kernel. >> +typedef struct xen_cpumask xen_cpumask_t; >> +#endif >> + >> +#define XEN_MAX_VNODES 8 >> +struct xen_vnode_data { >> + uint32_t vnode_id; >> + uint32_t mnode_id; >> + uint64_t nr_pages; >> + /* XXX: Can we use this in xen<->domain interfaces ? */ >> + struct xen_cpumask vcpu_mask; /* vnode_to_vcpumask */ >> +}; >> +#ifndef __XEN__ >> +typedef struct xen_vnode_data xen_vnode_data_t; > > Ditto. >> +#endif >> + >> +/* NUMA layout for the domain at the time of startup. >> + * Structure has to fit within a page. */ >> +struct xen_domain_numa_layout { >> + uint32_t max_vcpus; >> + uint32_t max_vnodes; >> + >> + /* Only (max_vnodes*max_vnodes) entries are filled */ >> + uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES]; >> + struct xen_vnode_data vnode_data[XEN_MAX_VNODES]; >> +}; >> +#ifndef __XEN__ >> +typedef struct xen_domain_numa_layout xen_domain_numa_layout_t; > > Ditto. >> +#endif >> + >> /* >> * Start-of-day memory layout for the initial domain (DOM0): >> * 1. The domain is started within contiguous virtual-memory region. >> @@ -449,6 +496,13 @@ struct start_info { >> unsigned long mod_start; /* VIRTUAL address of pre-loaded module. >> */ >> unsigned long mod_len; /* Size (bytes) of pre-loaded module. >> */ >> int8_t cmd_line[MAX_GUEST_CMDLINE]; >> + /* The pfn range here covers both page table and p->m table frames. 
*/ >> + unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table. */ > > Why is this added in this patch? That doesn't look to be used by your > patch. >> + unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table. */ > > Ditto. >> + struct { >> + uint32_t info_off; /* Offset of console_info struct. * >> + uint32_t info_size; /* Size of console_info struct from start.*/ > > Wrong comment. >> + } numa_layout_info; > >> }; >> >> struct dom0_vga_console_info { > > _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |