
Re: [Xen-devel] [PATCH RFC v2 1/2] linux/vnuma: vNUMA for PV domu guest



On Tue, Sep 17, 2013 at 10:21 AM, Boris Ostrovsky
<boris.ostrovsky@xxxxxxxxxx> wrote:
> On 09/17/2013 04:34 AM, Elena Ufimtseva wrote:
>>
>> Requests NUMA topology info from Xen by issuing a subop
>> hypercall. Initializes NUMA nodes, sets the number of CPUs,
>> the distance table and NUMA node memory ranges during boot.
>> The vNUMA topology is defined by the user in the VM config
>> file. Memory ranges are represented by the structure
>> vnuma_topology_info, where the start and end of each memory
>> area are given as guest pfns, constructed and aligned
>> according to the domain's e820 map.
>> If the received structure has errors, the code falls back
>> to dummy NUMA init.
>> Requires Xen with the patches from the vnuma patchset applied.
>>
>> Changes since v1:
>> - moved the test for xen_pv_domain() into xen_numa_init;
>> - replaced memory block search/allocation by single memblock_alloc;
>> - moved xen_numa_init to vnuma.c from enlighten.c;
>> - moved memblock structure to public interface memory.h;
>> - specified signedness of vnuma topology structure members;
>> - removed excessive debug output;
>>
>> TODO:
>> - consider common interface for Dom0, HVM and PV guests to provide
>> vNUMA topology;
>> - dynamic NUMA balancing at the time of this patch (kernel 3.11,
>> 6e4664525b1db28f8c4e1130957f70a94c19213e, with boot parameter
>> numa_balancing=true, which is the default) crashes a NUMA-enabled
>> guest. Investigate further.
>>
>> Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx>
>> ---
>>   arch/x86/include/asm/xen/vnuma.h |   12 +++++
>>   arch/x86/mm/numa.c               |    5 +++
>>   arch/x86/xen/Makefile            |    2 +-
>>   arch/x86/xen/vnuma.c             |   92 ++++++++++++++++++++++++++++++++++++++
>>   include/xen/interface/memory.h   |   27 +++++++++++
>>   5 files changed, 137 insertions(+), 1 deletion(-)
>>   create mode 100644 arch/x86/include/asm/xen/vnuma.h
>>   create mode 100644 arch/x86/xen/vnuma.c
>>
>> diff --git a/arch/x86/include/asm/xen/vnuma.h b/arch/x86/include/asm/xen/vnuma.h
>> new file mode 100644
>> index 0000000..1bf4cae
>> --- /dev/null
>> +++ b/arch/x86/include/asm/xen/vnuma.h
>> @@ -0,0 +1,12 @@
>> +#ifndef _ASM_X86_VNUMA_H
>> +#define _ASM_X86_VNUMA_H
>> +
>> +#ifdef CONFIG_XEN
>> +int xen_vnuma_support(void);
>> +int xen_numa_init(void);
>> +#else
>> +int xen_vnuma_support(void) { return 0; };
>> +int xen_numa_init(void) {};
>
>
> This should return -EINVAL. Or perhaps you can add
>
> #ifdef CONFIG_XEN
> #include "asm/xen/vnuma.h"
> #endif
>
> in numa.c  and not bother with ifdef here.
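
Agreed, will fix. A minimal sketch of what I have in mind (assuming
the stubs become static inline so the header can be included
unconditionally, and the !CONFIG_XEN stub returns an error):

    #ifdef CONFIG_XEN
    int xen_vnuma_support(void);
    int xen_numa_init(void);
    #else
    static inline int xen_vnuma_support(void) { return 0; }
    static inline int xen_numa_init(void) { return -EINVAL; }
    #endif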


>
>
>> +#endif
>> +
>> +#endif /* _ASM_X86_VNUMA_H */
>> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
>> index 8bf93ba..a95fadf 100644
>> --- a/arch/x86/mm/numa.c
>> +++ b/arch/x86/mm/numa.c
>> @@ -19,6 +19,7 @@
>>   #include <asm/amd_nb.h>
>>
>>   #include "numa_internal.h"
>> +#include "asm/xen/vnuma.h"
>>
>>   int __initdata numa_off;
>>   nodemask_t numa_nodes_parsed __initdata;
>> @@ -621,6 +622,10 @@ static int __init dummy_numa_init(void)
>>   void __init x86_numa_init(void)
>>   {
>>         if (!numa_off) {
>> +#ifdef CONFIG_XEN
>> +               if (!numa_init(xen_numa_init))
>> +                       return;
>> +#endif
>>   #ifdef CONFIG_X86_NUMAQ
>>                 if (!numa_init(numaq_numa_init))
>>                         return;
>> diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
>> index 96ab2c0..de9deab 100644
>> --- a/arch/x86/xen/Makefile
>> +++ b/arch/x86/xen/Makefile
>> @@ -13,7 +13,7 @@ CFLAGS_mmu.o                  := $(nostackp)
>>   obj-y         := enlighten.o setup.o multicalls.o mmu.o irq.o \
>>                         time.o xen-asm.o xen-asm_$(BITS).o \
>>                         grant-table.o suspend.o platform-pci-unplug.o \
>> -                       p2m.o
>> +                       p2m.o vnuma.o
>>     obj-$(CONFIG_EVENT_TRACING) += trace.o
>>
>> diff --git a/arch/x86/xen/vnuma.c b/arch/x86/xen/vnuma.c
>> new file mode 100644
>> index 0000000..3c6c73f
>> --- /dev/null
>> +++ b/arch/x86/xen/vnuma.c
>> @@ -0,0 +1,92 @@
>> +#include <linux/err.h>
>> +#include <linux/memblock.h>
>> +#include <xen/interface/xen.h>
>> +#include <xen/interface/memory.h>
>> +#include <asm/xen/interface.h>
>> +#include <asm/xen/hypercall.h>
>> +#include <asm/xen/vnuma.h>
>> +#ifdef CONFIG_NUMA
>> +/* Xen PV NUMA topology initialization */
>> +static unsigned int xen_vnuma_init = 0;
>> +int xen_vnuma_support()
>> +{
>> +       return xen_vnuma_init;
>> +}
>> +int __init xen_numa_init(void)
>> +{
>> +       int rc;
>> +       unsigned int i, j, cpu, idx, pcpus;
>> +       u64 phys, physd, physc;
>> +       unsigned int *vdistance, *cpu_to_node;
>
>
> cpu_to_node may not be a particularly good name as there is a macro with
> the same name in topology.h

Sure, will fix.
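Probably something like this (just a sketch; cpu_to_node_map is a
placeholder name for the renamed local):

    unsigned int *vdistance, *cpu_to_node_map;
    ...
    cpu_to_node_map = __va(physc);
    set_xen_guest_handle(numa_topo.cpu_to_node, cpu_to_node_map);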
>
>
>> +       unsigned long mem_size, dist_size, cpu_to_node_size;
>> +       struct vnuma_memarea *varea;
>> +
>> +       struct vnuma_topology_info numa_topo = {
>> +               .domid = DOMID_SELF
>> +       };
>> +       rc = -EINVAL;
>> +       if (!xen_pv_domain())
>> +               return rc;
>
>
> No need to set rc here, just return -EINVAL;
>
> And please add spaces between lines to separate logical blocks a little.
Ok.
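The check at the top then simply becomes:

    if (!xen_pv_domain())
            return -EINVAL;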
>
>
>> +       pcpus = num_possible_cpus();
>> +       mem_size =  pcpus * sizeof(struct vnuma_memarea);
>> +       dist_size = pcpus * pcpus * sizeof(*numa_topo.vdistance);
>> +       cpu_to_node_size = pcpus * sizeof(*numa_topo.cpu_to_node);
>> +       phys = memblock_alloc(mem_size, PAGE_SIZE);
>> +       physd = memblock_alloc(dist_size, PAGE_SIZE);
>> +       physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
>> +       if (!phys || !physc || !physd)
>> +               goto vnumaout;
>> +       varea = __va(phys);
>> +       vdistance  = __va(physd);
>> +       cpu_to_node  = __va(physc);
>> +       set_xen_guest_handle(numa_topo.vmemarea, varea);
>> +       set_xen_guest_handle(numa_topo.vdistance, vdistance);
>> +       set_xen_guest_handle(numa_topo.cpu_to_node, cpu_to_node);
>> +       rc = HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo);
>> +       if (rc < 0)
>> +               goto vnumaout;
>> +       rc = -EINVAL;
>> +       if (numa_topo.nr_nodes == 0) {
>> +               /* will pass to dummy_numa_init */
>> +               goto vnumaout;
>> +       }
>> +       if (numa_topo.nr_nodes > num_possible_cpus()) {
>> +               pr_debug("vNUMA: Node without cpu is not supported in this version.\n");
>> +               goto vnumaout;
>> +       }
>> +       /*
>> +        * NUMA nodes memory ranges are in pfns, constructed and
>> +        * aligned based on e820 ram domain map
>> +       */
>> +       for (i = 0; i < numa_topo.nr_nodes; i++) {
>> +               if (numa_add_memblk(i, varea[i].start, varea[i].end))
>> +                       /* pass to numa_dummy_init */
>> +                       goto vnumaout;
>> +               node_set(i, numa_nodes_parsed);
>> +       }
>> +       setup_nr_node_ids();
>> +       /* Setting the cpu, apicid to node */
>> +       for_each_cpu(cpu, cpu_possible_mask) {
>> +               set_apicid_to_node(cpu, cpu_to_node[cpu]);
>> +               numa_set_node(cpu, cpu_to_node[cpu]);
>> +               __apicid_to_node[cpu] = cpu_to_node[cpu];
>
>
> Isn't this what set_apicid_to_node() above will do?

Yes, exactly the same :) Will fix.
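Since set_apicid_to_node(cpu, node) already performs the
__apicid_to_node[cpu] = node store, the loop would reduce to
something like:

    for_each_cpu(cpu, cpu_possible_mask) {
            set_apicid_to_node(cpu, cpu_to_node[cpu]);
            numa_set_node(cpu, cpu_to_node[cpu]);
            cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]);
    }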
>
> -boris
>
>
>> +               cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]);
>> +       }
>> +       for (i = 0; i < numa_topo.nr_nodes; i++) {
>> +               for (j = 0; j < numa_topo.nr_nodes; j++) {
>> +                       idx = (j * numa_topo.nr_nodes) + i;
>> +                       numa_set_distance(i, j, *(vdistance + idx));
>> +               }
>> +       }
>> +       rc = 0;
>> +       xen_vnuma_init = 1;
>> +vnumaout:
>> +       if (phys)
>> +               memblock_free(__pa(phys), mem_size);
>> +       if (physd)
>> +               memblock_free(__pa(physd), dist_size);
>> +       if (physc)
>> +               memblock_free(__pa(physc), cpu_to_node_size);
>> +       return rc;
>> +}
>> +#endif
>> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
>> index 2ecfe4f..4237f51 100644
>> --- a/include/xen/interface/memory.h
>> +++ b/include/xen/interface/memory.h
>> @@ -263,4 +263,31 @@ struct xen_remove_from_physmap {
>>   };
>>   DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap);
>>
>> +/* vNUMA structures */
>> +struct vnuma_memarea {
>> +       uint64_t start, end;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_memarea);
>> +
>> +struct vnuma_topology_info {
>> +       /* OUT */
>> +       domid_t domid;
>> +       /* IN */
>> +       uint16_t nr_nodes; /* number of virtual numa nodes */
>> +       uint32_t _pad;
>> +       /* distance table */
>> +       GUEST_HANDLE(uint) vdistance;
>> +       /* cpu mapping to vnodes */
>> +       GUEST_HANDLE(uint) cpu_to_node;
>> +       /*
>> +       * array of numa memory areas constructed by Xen
>> +       * where start and end are pfn numbers of the area
>> +       * Xen takes into account domains e820 map
>> +       */
>> +       GUEST_HANDLE(vnuma_memarea) vmemarea;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info);
>> +
>> +#define XENMEM_get_vnuma_info  25
>> +
>>   #endif /* __XEN_PUBLIC_MEMORY_H__ */
>
>



-- 
Elena
