[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest



On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
> Uses subop hypercall to request XEN about vnuma topology.
> Sets the memory blocks (aligned by XEN), cpus, distance table
> on boot. NUMA support should be compiled in kernel.

Needs a bit more details (which I am sure you will add in the
next postings). Mostly:
 - How E820 and NUMA information mesh
 - What the hypercall provides
 - What changeset in the Xen provides this hypercall
 - 
> 
> NUMA support should be compiled in kernel.
> 
> Memory blocks are aligned by xen. Xen is aware of guest initial
> RAM layout.
> If the memory blocks are incorrect, call for default linux numa
> dummy init.

Unfortunatly that won't happen if you boot this under dom0. It will
find on some AMD machines the AMD Northbridge and try to scan that.
And blow up.

If you look at the git commit that did the 'numa = 0' you will see
the backstory.

I think part of enablement of ' numa = 1' is to wrap it with

        if (xen_initial_domain() && xen_supports_this_hypercall())
                numa = 1;

in the #2 patch.

> 
> Requires XEN with vNUMA support.
> 
> Applies to 3.10 git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git
> latest commit: 8bb495e3f02401ee6f76d1b1d77f3ac9f079e376
> 
> TODO:
> Change subop from XENMEM to SYSCTL.
> 
> Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx>
> ---
>  arch/x86/include/asm/xen/vnuma-xen.h |   36 +++++++++++
>  arch/x86/mm/numa.c                   |    8 +++
>  arch/x86/xen/enlighten.c             |  115 
> ++++++++++++++++++++++++++++++++++
>  include/xen/interface/memory.h       |    1 +
>  4 files changed, 160 insertions(+)
>  create mode 100644 arch/x86/include/asm/xen/vnuma-xen.h
> 
> diff --git a/arch/x86/include/asm/xen/vnuma-xen.h 
> b/arch/x86/include/asm/xen/vnuma-xen.h
> new file mode 100644
> index 0000000..73c1bde
> --- /dev/null
> +++ b/arch/x86/include/asm/xen/vnuma-xen.h
> @@ -0,0 +1,36 @@
> +#ifndef _ASM_X86_VNUMA_XEN_H
> +#define _ASM_X86_VNUMA_XEN_H
> +
> +#ifdef CONFIG_XEN
> +int __initdata xen_numa_init(void);
> +
> +struct vnuma_memblk {
> +               uint64_t start, end;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_memblk);
> +
> +struct vnuma_topology_info {
> +     domid_t domid;
> +     uint16_t nr_nodes;
> +     GUEST_HANDLE(vnuma_memblk) vmemblk;
> +     GUEST_HANDLE(int) vdistance;
> +     GUEST_HANDLE(int) cpu_to_node;
> +     GUEST_HANDLE(int) vnode_to_pnode;

A comment explaining what those values are meant to have would be good.
Perhaps with a simple example.

> +};

The structure is not very 32-bit friendly. Would it be possible
to add some padding so that the size of this structure is the
same on 32-bit and 64-bit please?

Perhaps:
        domid_t domid;
        uint16_t nr_nodes;
        uint32_t _pad;
        GUEST_HANDLE(vnuma_memblk) vmemblk;
        GUEST_HANDLE(int) vdistance;
        GUEST_HANDLE(int) cpu_to_node;
        GUEST_HANDLE(int) vnode_to_pnode;

That should make the offsets be the same on both 32 and 64-bit
I think.

> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info);
> +
> +struct xen_domctl {
> +     uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
> +     domid_t  domain;
> +     union {
> +             struct vnuma_topology_info vnuma;
> +             } u;

Move that 'u' one tab back please.

> +};
> +typedef struct xen_domctl xen_domctl_t;

Ewww. No typdefs in the Linux code please.

> +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_t);
> +
> +#else
> +int __init xen_numa_init(void) {}
> +#endif
> +
> +#endif /* _ASM_X86_VNUMA_XEN_H */
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 8bf93ba..3ec7855 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -20,6 +20,10 @@
>  
>  #include "numa_internal.h"
>  
> +#ifdef CONFIG_XEN
> +#include "asm/xen/vnuma-xen.h"
> +#endif
> +
>  int __initdata numa_off;
>  nodemask_t numa_nodes_parsed __initdata;
>  
> @@ -621,6 +625,10 @@ static int __init dummy_numa_init(void)
>  void __init x86_numa_init(void)
>  {
>       if (!numa_off) {
> +#ifdef CONFIG_XEN
> +             if (xen_pv_domain() && !numa_init(xen_numa_init))

I would remove the xen_pv_domain() check and add that in enlighten.c code.

> +                     return;
> +#endif
>  #ifdef CONFIG_X86_NUMAQ
>               if (!numa_init(numaq_numa_init))
>                       return;
> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> index 193097e..4658e9d 100644
> --- a/arch/x86/xen/enlighten.c
> +++ b/arch/x86/xen/enlighten.c
> @@ -33,6 +33,7 @@
>  #include <linux/memblock.h>
>  #include <linux/edd.h>
>  
> +#include <asm/numa.h>
>  #include <xen/xen.h>
>  #include <xen/events.h>
>  #include <xen/interface/xen.h>
> @@ -69,6 +70,7 @@
>  #include <asm/mwait.h>
>  #include <asm/pci_x86.h>
>  #include <asm/pat.h>
> +#include <asm/xen/vnuma-xen.h>
>  
>  #ifdef CONFIG_ACPI
>  #include <linux/acpi.h>
> @@ -1750,4 +1752,117 @@ const struct hypervisor_x86 x86_hyper_xen_hvm 
> __refconst = {
>       .x2apic_available       = xen_x2apic_para_available,
>  };
>  EXPORT_SYMBOL(x86_hyper_xen_hvm);
> +
> +/* Xen PV NUMA topology initialization */
> +int __initdata xen_numa_init(void)
> +{
> +     struct vnuma_memblk *vblk;
> +     struct vnuma_topology_info numa_topo;
> +     uint64_t size, start, end;
> +     int i, j, cpu, rc;
> +     u64 phys, physd, physc;
> +     u64 new_end_pfn;
> +     size_t mem_size;
> +
> +     int *vdistance, *cpu_to_node;
> +     unsigned long dist_size, cpu_to_node_size;

You need an extra line here;

> +     numa_topo.domid = DOMID_SELF;
> +     rc = -EINVAL;

Both of those can be part of the decleration of variables above. Like so:
        int rc = -EINVAL;
        struct vnuma_topology_info numa_topo = {
                .domid = DOMID_SELF;
        };

> +     /* block mem reservation for memblks */
> +     mem_size = num_possible_cpus() * sizeof(struct vnuma_memblk);
> +     phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), mem_size, 
> PAGE_SIZE);
> +     if (!phys) {
> +             pr_info("vNUMA: can't allocate memory for membloks size %zu\n", 
> mem_size);
> +             rc = -ENOMEM;
> +             goto errout;
> +     }
> +     if (memblock_reserve(phys, mem_size) < 0) {
> +             pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", 
> mem_size);
> +             rc = -ENOMEM;
> +             goto errout;
> +     }
> +     vblk = __va(phys);
> +     set_xen_guest_handle(numa_topo.vmemblk, vblk);
> +
> +     dist_size = num_possible_cpus() * num_possible_cpus() * 
> sizeof(*numa_topo.vdistance);
> +     physd = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), dist_size, 
> PAGE_SIZE);
> +     if (!physd) {
> +             pr_info("vNUMA: can't allocate memory for distance size %zu\n", 
> dist_size);
> +             rc = -ENOMEM;
> +             goto errout;
> +     }
> +     if (memblock_reserve(physd, dist_size) < 0) {
> +             pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", 
> dist_size);
> +             rc = -ENOMEM;
> +             goto errout;
> +     }
> +     vdistance  = __va(physd);
> +     set_xen_guest_handle(numa_topo.vdistance, (int *)vdistance);
> +
> +     /* allocate memblock for vcpu to node mask of max size */
> +     cpu_to_node_size = num_possible_cpus() * sizeof(*numa_topo.cpu_to_node);
> +     physc = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), 
> cpu_to_node_size, PAGE_SIZE);
> +     if (!physc) {
> +             pr_info("vNUMA: can't allocate memory for distance size %zu\n", 
> cpu_to_node_size);
> +             rc = -ENOMEM;
> +             goto errout;
> +     }
> +     if (memblock_reserve(physc, cpu_to_node_size) < 0) {
> +             pr_info("vNUMA: Cannot reserver mem for blocks of size %zu\n", 
> cpu_to_node_size);
> +             goto errout;
> +     }
> +     cpu_to_node  = __va(physc);
> +     set_xen_guest_handle(numa_topo.cpu_to_node, (int *)cpu_to_node);
> +     if (HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) < 0)

I think you need to do rc = HYPERVISOR...

and then if you get rc = -ENOSYS (so not implemented) change the rc to zero.

> +             goto errout;
> +     if (numa_topo.nr_nodes == 0)
> +             /* Pass to DUMMY numa */

Also change rc to zero.

> +             goto errout;
> +     if (numa_topo.nr_nodes > num_possible_cpus()) {
> +             pr_info("vNUMA: Node without cpu is not supported, nodes = 
> %d\n", numa_topo.nr_nodes);
> +             goto errout;
> +     }
> +     new_end_pfn = 0;
> +     /* We have sizes in pages received from hypervisor, no holes and 
> ordered */

From toolstack you mean. It is the one setting it up. And I hope it sets it up
based on the E820 it has constructed as well? Otherwise it would be a bit
awkward if those two were different.

> +     for (i = 0; i < numa_topo.nr_nodes; i++) {
> +             start = vblk[i].start;
> +             end = vblk[i].end;
> +             size = end - start;
> +             pr_info("VNMA blk[%d] size is %#010Lx start = %#010Lx end = 
> %#010Lx\n",
> +                     i, size, start, end);

pr_debug perhaps?

> +             if (numa_add_memblk(i, start, end)) {
> +                     pr_info("vNUMA: Could not add memblk[%d]\n", i);
> +                     rc = -EAGAIN;

Can you unroll the NUMA topology if this fails? Or is that not a problem?

> +                     goto errout;
> +             }
> +             node_set(i, numa_nodes_parsed);
> +     }
> +     setup_nr_node_ids();
> +     /* Setting the cpu, apicid to node */
> +     for_each_cpu(cpu, cpu_possible_mask) {
> +             pr_info("Setting up vcpu[%d] to node[%d]\n", cpu, 
> cpu_to_node[cpu]);

pr_debug
> +             set_apicid_to_node(cpu, cpu_to_node[cpu]);
> +             numa_set_node(cpu, cpu_to_node[cpu]);
> +             __apicid_to_node[cpu] = cpu_to_node[cpu];
> +             cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]);
> +     }
> +     setup_nr_node_ids();

How come this is being called twice? It should have a comment
explaining this extra call.

> +     rc = 0;
> +     for (i = 0; i < numa_topo.nr_nodes; i++)
> +             for (j = 0; j < numa_topo.nr_nodes; j++) {
> +                     numa_set_distance(i, j, *(vdistance + 
> j*numa_topo.nr_nodes + i));

You could simply this by just doing:

                        int idx = (j * numa_topo.nr_nodes) + i;

                        ... *(vdistance + idx));

> +                     pr_info("distance %d <-> %d = %d\n", i, j, *(vdistance 
> + j*numa_topo.nr_nodes + i));

pr_debug

> +             }
> +errout:
> +     if (phys)
> +             memblock_free(__pa(phys), mem_size);
> +     if (physd)
> +             memblock_free(__pa(physd), dist_size);
> +     if (physc)
> +             memblock_free(__pa(physc), cpu_to_node_size);
> +     if (rc)
> +             pr_debug("XEN: hypercall failed to get vNUMA topology.\n");

And include the 'rc' value in the output please.
> +     return rc;
> +}
> +
>  #endif
> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
> index 2ecfe4f..16b8d87 100644
> --- a/include/xen/interface/memory.h
> +++ b/include/xen/interface/memory.h
> @@ -263,4 +263,5 @@ struct xen_remove_from_physmap {
>  };
>  DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap);
>  
> +#define XENMEM_get_vnuma_info        25
>  #endif /* __XEN_PUBLIC_MEMORY_H__ */
> -- 
> 1.7.10.4
> 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.