Re: [Xen-devel] [PATCH 10/11] [PVOPS] Use enlightenment to setup nodes
> It looks as if the comments I provided in
> <20100216174900.GB21067@xxxxxxxxxxxxxxxxxxx> were ignored.

I incorporated a number of your comments, but obviously missed some.
Sorry, wasn't intentional. Comments inline.

On Mon, Apr 5, 2010 at 10:16 AM, Konrad Rzeszutek Wilk
<konrad.wilk@xxxxxxxxxx> wrote:
>
> It looks as if the comments I provided in
> <20100216174900.GB21067@xxxxxxxxxxxxxxxxxxx> were ignored.
>
> http://article.gmane.org/gmane.comp.emulators.xen.devel/78118
>
> Here I am repeating some of them and adding some new ones.
> Please address/answer them.
>
> .. snip..
>
>> -       acpi_numa_init();
>> +       if (acpi_numa > 0)
>> +           acpi_numa_init();
>
> The style guide is to use tabs, not spaces. It looks out-of-place?

I had the darn "expandtab" set in my vimrc. I have run the patch
through checkpatch.pl and will fix the indentation issues.

> Why not just do acpi_numa_init() by itself?

With PV kernels we know well ahead of time that ACPI NUMA is
irrelevant, same as when booting Linux with noacpi. I thought it
unnecessary to even parse the NUMA ACPI tables in that case.
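
If the check at the call site bothers you, it could equally well live
inside acpi_numa_init() so the caller stays untouched -- untested
sketch, just to illustrate:

    int __init acpi_numa_init(void)
    {
            /* Nothing to parse when ACPI NUMA was forced off early
             * (e.g. Xen PV sets acpi_numa = -1 in xen_arch_setup). */
            if (acpi_numa < 0)
                    return -1;
            /* ... existing SRAT/SLIT parsing ... */
    }

Either way works for me; I kept the check at the call site to avoid
touching generic ACPI code.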
>
>>  #endif
>>
>>         initmem_init(0, max_pfn);
>> diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
>> index 459913b..3a856dc 100644
>> --- a/arch/x86/mm/numa_64.c
>> +++ b/arch/x86/mm/numa_64.c
>> @@ -11,7 +11,11 @@
>>  #include <linux/ctype.h>
>>  #include <linux/module.h>
>>  #include <linux/nodemask.h>
>> +#include <linux/cpumask.h>
>>  #include <linux/sched.h>
>> +#include <linux/bitops.h>
>> +#include <xen/interface/xen.h>
>> +#include <xen/interface/memory.h>
>>
>>  #include <asm/e820.h>
>>  #include <asm/proto.h>
>> @@ -19,6 +23,8 @@
>>  #include <asm/numa.h>
>>  #include <asm/acpi.h>
>>  #include <asm/k8.h>
>> +#include <asm/xen/hypervisor.h>
>> +#include <asm/xen/hypercall.h>
>>
>>  struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
>>  EXPORT_SYMBOL(node_data);
>> @@ -428,7 +434,6 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
>>   */
>>     if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
>>         long n = simple_strtol(cmdline, NULL, 0);
>> -
>
> Uhh, why?

By mistake! :)

>>         num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
>>         if (num_nodes < 0)
>>             return num_nodes;
>> @@ -522,6 +527,246 @@ out:
>>     numa_init_array();
>>     return 0;
>>  }
>> +
>> +/************************************************************************/
>> +#ifdef CONFIG_XEN_NUMA_GUEST
>
> Yikes. Can you move all of this code into its own file so that there is
> no need to sprinkle this with #ifdefs?

I agree with you. I will also take care of the few xen header files I
have added to the file.
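
Concretely (all names tentative, not final): a new arch/x86/mm/xen-numa.c
would carry everything that is under CONFIG_XEN_NUMA_GUEST here, hooked
up in arch/x86/mm/Makefile with

    obj-$(CONFIG_XEN_NUMA_GUEST) += xen-numa.o

plus a small header so that numa_64.c keeps a single #ifdef-free call:

    /* arch/x86/include/asm/xen/numa.h -- tentative */
    #ifdef CONFIG_XEN_NUMA_GUEST
    int xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn);
    #else
    static inline int
    xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
    {
            return -1;
    }
    #endif /* CONFIG_XEN_NUMA_GUEST */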
>> +/* XEN PV GUEST NUMA */
>> +struct xen_domain_numa_layout HYPERVISOR_pv_numa_layout;
>> +
>> +static inline void __init
>> +bitmap_byte_to_long(unsigned long *lp, const uint8_t *bp, int nbits)
>> +{
>> +    /* We may need to pad the final longword with zeroes. */
>> +    if (nbits & (BITS_PER_LONG-1))
>> +        lp[BITS_TO_LONGS(nbits)-1] = 0;
>> +    memcpy(lp, bp, (nbits+7)/8);
>> +}
>> +
>> +static void __init
>> +xenctl_cpumask_to_cpumask(cpumask_t *cpumask, struct xenctl_cpumask *xcpumask)
>> +{
>> +    unsigned int nr_cpus;
>> +    uint8_t *bytemap;
>> +
>> +    bytemap = xcpumask->bits;
>> +
>> +    nr_cpus =
>> +        min_t(unsigned int, XENCTL_NR_CPUS, NR_CPUS);
>> +
>> +    cpumask_clear(cpumask);
>> +    bitmap_byte_to_long(cpus_addr(*cpumask), bytemap, nr_cpus);
>> +}
>> +
>> +static void __init
>> +xen_dump_numa_layout(struct xen_domain_numa_layout *layout)
>> +{
>> +    unsigned int i, j;
>> +    char vcpumaskstr[128];
>> +    printk("NUMA-LAYOUT : vcpus(%u), vnodes(%u), ",
>> +            layout->max_vcpus, layout->max_vnodes);
>
> No good. You need printk(KERN_DEBUG ..

Missed these from your previous comments. Will take care of these.
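
They will all grow a level, i.e.:

    printk(KERN_DEBUG "NUMA-LAYOUT : vcpus(%u), vnodes(%u), ",
           layout->max_vcpus, layout->max_vnodes);

(or pr_debug(), which hides the KERN_DEBUG prefix entirely).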
>> +    switch (layout->type)
>> +    {
>> +    case XEN_DOM_NUMA_CONFINED:
>> +        printk("type(CONFINED)\n");

> ditto.

>> +        break;
>> +    case XEN_DOM_NUMA_SPLIT:
>> +        printk("type(SPLIT)\n");

> ditto.

>> +        break;
>> +    case XEN_DOM_NUMA_STRIPED:
>> +        printk("type(STRIPED)\n");

> ditto.

>> +        break;
>> +    default:
>> +        printk("type(UNDEFINED)\n");
>
> ditto.

>> +    }
>> +
>> +    for (i = 0; i < layout->max_vnodes; i++)
>> +    {
>> +        cpumask_t vcpu_mask;
>> +        struct xenctl_cpumask *xvcpumask;
>> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
>> +        xvcpumask = &vnode_data->vcpu_mask;
>> +        xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpumask);
>> +        cpumask_scnprintf(vcpumaskstr, sizeof(vcpumaskstr), &vcpu_mask);
>> +        printk("vnode[%u]:mnode(%u), node_nr_pages(%lx), vcpu_mask(%s)\n",

> ditto.

>> +            vnode_data->vnode_id, vnode_data->mnode_id,
>> +            (unsigned long)vnode_data->nr_pages, vcpumaskstr);
>> +    }
>> +
>> +    printk("vnode distances :\n");

> ditto.

>> +    for (i = 0; i < layout->max_vnodes; i++)
>> +        printk("\tvnode[%u]", i);

> ditto.

>> +    for (i = 0; i < layout->max_vnodes; i++)
>> +    {
>> +        printk("\nvnode[%u]", i);

> ditto.

>> +        for (j = 0; j < layout->max_vnodes; j++)
>> +            printk("\t%u", layout->vnode_distance[i*layout->max_vnodes +
>> j]);
>> +        printk("\n");

> ditto.

>> +    }
>> +    return;
>> +}
>> +
>> +static void __init xen_init_slit_table(struct xen_domain_numa_layout *layout)
>> +{
>> +    /* Construct a slit table (using layout->vnode_distance).
>> +     * Copy it to acpi_slit. */
>> +    return;
>> +}
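
The SLIT construction above is still a stub. What I have in mind is
roughly the following -- untested, and it assumes srat_64.c exports a
setter for its static acpi_slit (the acpi_numa_set_slit() below does
not exist yet):

    static void __init
    xen_init_slit_table(struct xen_domain_numa_layout *layout)
    {
            unsigned int i, j, n = layout->max_vnodes;
            struct acpi_table_slit *slit;

            /* SLIT payload: locality_count followed by n*n distance bytes */
            slit = alloc_bootmem(sizeof(*slit) + n * n);
            slit->locality_count = n;
            for (i = 0; i < n; i++)
                    for (j = 0; j < n; j++)
                            /* vnode_distance is uint32_t, SLIT wants u8 */
                            slit->entry[i * n + j] =
                                    layout->vnode_distance[i * n + j];
            acpi_numa_set_slit(slit);       /* hypothetical helper */
    }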
>> +
>> +/* Distribute the vcpus over the vnodes according to their affinity */
>> +static void __init xen_init_numa_array(struct xen_domain_numa_layout *layout)
>> +{
>> +    int vcpu, vnode;
>> +
>> +    printk(KERN_INFO "xen_numa_init_array - cpu_to_node initialization\n");
>> +
>> +    for (vnode = 0; vnode < layout->max_vnodes; vnode++)
>> +    {
>> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[vnode];
>> +        struct xenctl_cpumask *xvcpu_mask = &vnode_data->vcpu_mask;
>> +        cpumask_t vcpu_mask;
>> +
>> +        xenctl_cpumask_to_cpumask(&vcpu_mask, xvcpu_mask);
>> +
>> +        for (vcpu = 0; vcpu < layout->max_vcpus; vcpu++)
>> +        {
>> +            if (cpu_isset(vcpu, vcpu_mask))
>> +            {
>> +                if (early_cpu_to_node(vcpu) != NUMA_NO_NODE)
>> +                {
>> +                    printk(KERN_INFO "EARLY vcpu[%d] on vnode[%d]\n",
>> +                            vcpu, early_cpu_to_node(vcpu));
>> +                    continue;
>> +                }
>> +                printk(KERN_INFO "vcpu[%d] on vnode[%d]\n", vcpu, vnode);
>> +                numa_set_node(vcpu, vnode);
>> +            }
>> +        }
>> +    }
>> +    return;
>> +}
>> +
>> +static int __init xen_numa_emulation(struct xen_domain_numa_layout *layout,
>> +                unsigned long start_pfn, unsigned long last_pfn)
>> +{
>> +    int num_vnodes, i;
>> +    u64 node_start_addr, node_end_addr, max_addr;
>> +
>> +    printk(KERN_INFO "xen_numa_emulation : max_vnodes(%d), max_vcpus(%d)",
>> +                        layout->max_vnodes, layout->max_vcpus);
>> +
>> +    if (layout->type != XEN_DOM_NUMA_SPLIT)
>> +    {
>> +        printk(KERN_INFO "xen_numa_emulation : Invalid layout type");
>> +        return -1;
>> +    }
>> +
>> +    memset(&nodes, 0, sizeof(nodes));
>> +
>> +    num_vnodes = layout->max_vnodes;
>> +    BUG_ON(num_vnodes > MAX_NUMNODES);
>> +
>> +    max_addr = last_pfn << PAGE_SHIFT;
>> +
>> +    node_start_addr = start_pfn << PAGE_SHIFT;
>> +    for (i = 0; i < num_vnodes; i++)
>> +    {
>> +        struct xen_vnode_data *vnode_data = &layout->vnode_data[i];
>> +        u64 node_size = vnode_data->nr_pages << PAGE_SHIFT;
>> +
>> +        node_size &= FAKE_NODE_MIN_HASH_MASK; /* 64MB aligned */
>> +
>> +        if (i == (num_vnodes-1))
>> +            node_end_addr = max_addr;
>> +        else
>> +            node_end_addr = node_start_addr + node_size;
>> +        /* node_start_addr updated inside the function */
>> +        if (setup_node_range(i, nodes, &node_start_addr,
>> +                (node_end_addr-node_start_addr), max_addr+1))
>> +            goto failed;
>> +    }
>> +
>> +    printk(KERN_INFO "XEN domain numa emulation - setup nodes\n");
>> +
>> +    memnode_shift = compute_hash_shift(nodes, num_vnodes, NULL);
>> +    if (memnode_shift < 0) {
>> +        printk(KERN_ERR "No NUMA hash function found.\n");
>> +        goto failed;
>> +    }
>> +    /* XXX: Shouldn't be needed because we disabled acpi_numa very early ! */
>> +    /*
>> +     * We need to vacate all active ranges that may have been registered by
>> +     * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
>> +     * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
>> +     */
>> +    remove_all_active_ranges();
>> +
>> +    BUG_ON(acpi_numa >= 0);
>> +    for_each_node_mask(i, node_possible_map) {
>> +        e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
>> +                        nodes[i].end >> PAGE_SHIFT);
>> +        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
>> +    }
>> +    xen_init_slit_table(layout);
>> +    xen_init_numa_array(layout);
>> +    return 0;
>> +failed:
>> +    return -1;
>> +}
>> +
>> +static int __init
>> +xen_get_domain_numa_layout(struct xen_domain_numa_layout *pv_layout)
>> +{
>> +    int rc;
>> +    struct xenmem_numa_op memop;
>> +    memop.cmd = XENMEM_get_domain_numa_layout;
>> +    memop.u.dinfo.domid = DOMID_SELF;
>> +    memop.u.dinfo.version = XEN_DOM_NUMA_INTERFACE_VERSION;
>> +    memop.u.dinfo.bufsize = sizeof(*pv_layout);
>> +    set_xen_guest_handle(memop.u.dinfo.buf, pv_layout);
>> +
>> +    if ((rc = HYPERVISOR_memory_op(XENMEM_numa_op, &memop)))
>> +    {
>> +        printk(KERN_INFO "XEN NUMA GUEST:xen_get_domain_numa_layout failed\n");
>> +        xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
>> +        goto done;
>> +    }
>> +
>> +    if (pv_layout->version != XEN_DOM_NUMA_INTERFACE_VERSION)
>> +    {
>> +        printk(KERN_INFO "XEN NUMA GUEST: version mismatch (disabling)\n");
>> +        xen_start_info->flags &= ~SIF_NUMA_DOMAIN;
>> +        rc = -1;
>> +    }
>> +    xen_dump_numa_layout(pv_layout);
>> +done:
>> +    return rc;
>> +}
>> +
>> +static int __init
>> +xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
>> +{
>> +    int rc = 0;
>> +    if (!xen_pv_domain() || !(xen_start_info->flags & SIF_NUMA_DOMAIN))
>> +    {
>> +        rc = -1;
>> +        printk(KERN_INFO "xen numa emulation disabled\n");
>> +        goto done;
>
> The tabs/spaces don't look right. Can you run this patch through
> checkpatch.pl?

Ok.
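
For the record, that is just

    $ ./scripts/checkpatch.pl 0010-pvops-use-enlightenment-to-setup-nodes.patch

(file name invented here) and I am fixing everything it flags -- the
spaces are the "expandtab" fallout mentioned above.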
>> +    }
>> +    if ((rc = xen_get_domain_numa_layout(&HYPERVISOR_pv_numa_layout)))
>> +        goto done;
>> +    rc = xen_numa_emulation(&HYPERVISOR_pv_numa_layout, start_pfn, last_pfn);
>> +done:
>> +    return rc;
>> +}
>> +#else
>> +static inline int __init
>> +xen_pv_numa(unsigned long start_pfn, unsigned long last_pfn)
>> +{
>> +    return -1;
>> +}
>> +#endif /* CONFIG_XEN_NUMA_GUEST */
>> +/************************************************************************/
>>  #endif /* CONFIG_NUMA_EMU */
>>
>>  void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
>> @@ -532,6 +777,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
>>     nodes_clear(node_online_map);
>>
>>  #ifdef CONFIG_NUMA_EMU
>> +    if (!xen_pv_numa(start_pfn, last_pfn))
>> +        return;
>> +
>>     if (cmdline && !numa_emulation(start_pfn, last_pfn))
>>         return;
>>     nodes_clear(node_possible_map);
>> diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
>> index 7675f9b..7cf6a4f 100644
>> --- a/arch/x86/xen/Kconfig
>> +++ b/arch/x86/xen/Kconfig
>> @@ -73,3 +73,12 @@ config XEN_PCI_PASSTHROUGH
>>       help
>>         Enable support for passing PCI devices through to
>>         unprivileged domains. (COMPLETELY UNTESTED)
>> +
>> +config XEN_NUMA_GUEST
>> +    bool "Enable support for NUMA-aware Xen domains"
>> +    depends on XEN && X86_64 && NUMA && NUMA_EMU
>> +    help
>> +      Enable support for NUMA-aware Xen domains. With this
>> +      option, if the memory for the domain is allocated
>> +      from different memory nodes, the domain is made aware
>> +      of this through a virtual NUMA enlightenment.
>> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
>> index df3e84c..2ee9f0b 100644
>> --- a/arch/x86/xen/setup.c
>> +++ b/arch/x86/xen/setup.c
>> @@ -285,6 +285,8 @@ void __init xen_arch_setup(void)
>>         printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
>>         disable_acpi();
>>     }
>> +    acpi_numa = -1;
>> +    numa_off = 1;
>>  #endif
>>
>>  /*
>> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
>> index 32ab005..7f5b85e 100644
>> --- a/include/xen/interface/memory.h
>> +++ b/include/xen/interface/memory.h
>> @@ -240,6 +240,102 @@ DEFINE_GUEST_HANDLE_STRUCT(xen_memory_map);
>>   */
>>  #define XENMEM_machine_memory_map   10
>>
>> +/* xen guest numa operations */
>> +#define XENMEM_numa_op              15
>> +
>> +#define XEN_DOM_NUMA_INTERFACE_VERSION 0x00000001
>> +#define XENCTL_NR_CPUS 64
>> +#define XENCTL_BITS_PER_BYTE 8
>> +#define XENCTL_BITS_TO_BYTES(bits) \
>> +    (((bits)+XENCTL_BITS_PER_BYTE-1)/XENCTL_BITS_PER_BYTE)
>> +
>> +#define XENCTL_DECLARE_BITMAP(name,bits) \
>> +    uint8_t name[XENCTL_BITS_TO_BYTES(bits)]
>> +struct xenctl_cpumask{ XENCTL_DECLARE_BITMAP(bits, XENCTL_NR_CPUS); };
>> +
>> +/* Not used in guest */
>> +#define XENMEM_machine_numa_layout  0x01
>> +struct xenmem_node_data {
>> +    uint32_t node_id;
>> +    uint64_t node_memsize;
>> +    uint64_t node_memfree;
>> +    struct xenctl_cpumask cpu_mask; /* node_to_cpumask */
>> +};
>> +
>> +/* NUMA layout for the machine.
>> + * Structure has to fit within a page. */
>> +struct xenmem_machine_numa_layout {
>> +    uint32_t max_nodes;
>> +    /* Only (max_nodes*max_nodes) entries are filled */
>> +    GUEST_HANDLE(uint32_t) node_distance;
>> +    /* max_vnodes entries of xenmem_node_data type */
>> +    GUEST_HANDLE(void) node_data;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_numa_layout);
>> +
>> +
>> +#define XENMEM_machine_nodemap  0x02
>> +struct xenmem_machine_nodemap {
>> +    /* On call the size of the available buffer */
>> +    uint32_t bufsize;
>> +
>> +    /* memnode map parameters */
>> +    int32_t shift;
>> +    uint32_t mapsize;
>> +    GUEST_HANDLE(void) map;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_machine_nodemap);
>> +
>> +/* NUMA layout for the domain at the time of startup.
>> + * Structure has to fit within a page. */
>> +#define XENMEM_set_domain_numa_layout  0x03
>> +#define XENMEM_get_domain_numa_layout  0x04
>> +
>> +/* NUMA layout for the domain at the time of startup.
>> + * Structure has to fit within a page. */
>> +#define XEN_MAX_VNODES 8
>> +
>> +struct xen_vnode_data {
>> +    uint32_t vnode_id;
>> +    uint32_t mnode_id;
>> +    uint64_t nr_pages;
>> +    struct xenctl_cpumask vcpu_mask; /* vnode_to_vcpumask */
>> +};
>> +
>> +#define XEN_DOM_NUMA_CONFINED   0x01
>> +#define XEN_DOM_NUMA_SPLIT      0x02
>> +#define XEN_DOM_NUMA_STRIPED    0x03
>> +struct xen_domain_numa_layout {
>> +    uint32_t version;
>> +    uint32_t type;
>> +
>> +    uint32_t max_vcpus;
>> +    uint32_t max_vnodes;
>> +
>> +    /* Only (max_vnodes*max_vnodes) entries are filled */
>> +    uint32_t vnode_distance[XEN_MAX_VNODES * XEN_MAX_VNODES];
>> +    /* Only (max_vnodes) entries are filled */
>> +    struct xen_vnode_data vnode_data[XEN_MAX_VNODES];
>> +};
>> +
>> +struct xenmem_domain_numa_layout {
>> +    domid_t domid;
>> +    uint32_t version;
>> +
>> +    uint32_t bufsize;
>> +    GUEST_HANDLE(void) buf;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_domain_numa_layout);
>> +
>> +struct xenmem_numa_op {
>> +    uint32_t cmd;
>> +    union {
>> +        struct xenmem_machine_numa_layout minfo;
>> +        struct xenmem_machine_nodemap mnodemap;
>> +        struct xenmem_domain_numa_layout dinfo;
>> +    } u;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(xenmem_numa_op);
>>
>>  /*
>>   * Prevent the balloon driver from changing the memory reservation
>> diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
>> index 9ffaee0..17cb17d 100644
>> --- a/include/xen/interface/xen.h
>> +++ b/include/xen/interface/xen.h
>> @@ -494,6 +494,8 @@ struct dom0_vga_console_info {
>>  /* These flags are passed in the 'flags' field of start_info_t. */
>>  #define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
>>  #define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
>> +#define SIF_MULTIBOOT_MOD (1<<2)  /* Is mod_start a multiboot module? */
>> +#define SIF_NUMA_DOMAIN   (1<<3)  /* Is the domain NUMA aware ? */
>>  #define SIF_PM_MASK       (0xFF<<8) /* reserve 1 byte for xen-pm options */
>>
>>  typedef uint64_t cpumap_t;
>> @@ -504,6 +506,7 @@ typedef uint8_t xen_domain_handle_t[16];
>>  #define __mk_unsigned_long(x) x ## UL
>>  #define mk_unsigned_long(x) __mk_unsigned_long(x)
>>
>> +DEFINE_GUEST_HANDLE(uint32_t);
>>  DEFINE_GUEST_HANDLE(uint64_t);
>>
>>  #else /* __ASSEMBLY__ */
>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@xxxxxxxxxxxxxxxxxxx
>> http://lists.xensource.com/xen-devel
>

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel