
[Xen-devel] [PATCH] x86: fix NUMA handling (c/s 20599)



c/s 20599 caused the hash shift to become significantly smaller on
systems with an SRAT like this:

(XEN) SRAT: Node 0 PXM 0 0-a0000
(XEN) SRAT: Node 0 PXM 0 100000-80000000
(XEN) SRAT: Node 1 PXM 1 80000000-d0000000
(XEN) SRAT: Node 1 PXM 1 100000000-130000000

Combined with the static size of the memnodemap[] array, NUMA therefore
got disabled on such systems. The backport from Linux was incomplete in
this respect, as Linux had much earlier already introduced a dynamically
allocated memnodemap[].
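
For reference, the arithmetic behind that: below is a small stand-alone
model (not Xen code) of what extract_lsb_from_nodes() computes for the
SRAT above, under the simplifying assumptions that paddr_to_pdx() is a
plain addr >> PAGE_SHIFT (i.e. no pdx compression is in effect) and that
PAGE_SHIFT is 12.

#include <stdio.h>

#define PAGE_SHIFT  12
#define NODEMAPSIZE 0xfff           /* size of the old static memnodemap[] */

struct node { unsigned long long start, end; };

int main(void)
{
    /* the SRAT ranges quoted above, in bytes */
    const struct node nodes[] = {
        { 0x0ULL,         0xa0000ULL     },
        { 0x100000ULL,    0x80000000ULL  },
        { 0x80000000ULL,  0xd0000000ULL  },
        { 0x100000000ULL, 0x130000000ULL },
    };
    unsigned long long bitfield = 0, memtop = 0;
    unsigned int i, shift = 0;

    for (i = 0; i < sizeof(nodes) / sizeof(nodes[0]); i++) {
        unsigned long long spdx = nodes[i].start >> PAGE_SHIFT;
        unsigned long long epdx = nodes[i].end >> PAGE_SHIFT;

        bitfield |= spdx;           /* only start addresses feed the LSB */
        if (epdx > memtop)
            memtop = epdx;
    }
    while (!(bitfield & (1ULL << shift)))
        shift++;                    /* lowest set bit, like find_first_bit() */

    printf("hash shift %u, map entries needed %llu, NODEMAPSIZE %u\n",
           shift, (memtop >> shift) + 1, (unsigned int)NODEMAPSIZE);
    /* prints: hash shift 8, map entries needed 4865, NODEMAPSIZE 4095 */
    return 0;
}

The required map (4865 entries under these assumptions) no longer fits
the old static array, which is how NUMA ended up disabled here.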

Further, doing to-pdx / from-pdx translations on addresses just past a
valid range is not correct, as the conversion may strip non-zero bits
(or fail to re-insert them on the way back) in that case.
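
By way of illustration only (the helper name below is made up, and it
simply reuses Xen's paddr_t and paddr_to_pdx()): a node's end address is
exclusive and may thus lie just outside the populated range, where the
translation can go wrong as described. Converting end - 1, which is
still inside the range, and adding 1 afterwards in pdx space avoids
that, and is the conversion the hunks below switch to.

/* Sketch: convert an exclusive range end to pdx space without ever
 * translating an address that lies outside the range itself. */
static inline unsigned long range_end_to_pdx(paddr_t end)
{
    /* end - 1 is the last byte inside the range; translating it cannot
     * strip non-zero bits, which translating the first address past
     * the range might. */
    return paddr_to_pdx(end - 1) + 1;
}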

Finally, using 63 as the cover-it-all shift value is invalid on 32-bit,
since pdx values are unsigned long and hence only 32 bits wide there.
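
Also for illustration only (BITS_PER_LONG below merely stands in for
Xen's definition, and the function name is made up): C leaves a right
shift by a count greater than or equal to the operand's width undefined,
so with a 32-bit unsigned long a constant shift of 63 cannot be relied
upon, while BITS_PER_LONG - 1 always can.

#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

static inline unsigned long pdx_to_slot(unsigned long pdx)
{
    /* pdx >> 63 is undefined behaviour when unsigned long is 32 bits
     * wide; shifting by BITS_PER_LONG - 1 is well defined and maps any
     * pdx such a build can produce to slot 0 of the dummy map. */
    return pdx >> (BITS_PER_LONG - 1);
}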

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxxxx>

--- 2010-01-06.orig/xen/arch/x86/numa.c 2010-01-06 15:16:18.000000000 +0100
+++ 2010-01-06/xen/arch/x86/numa.c      2010-01-06 16:10:59.000000000 +0100
@@ -30,7 +30,9 @@ struct node_data node_data[MAX_NUMNODES]
 
 /* Mapping from pdx to node id */
 int memnode_shift;
-u8  memnodemap[NODEMAPSIZE];
+static typeof(*memnodemap) _memnodemap[2];
+unsigned long memnodemapsize;
+u8 *memnodemap;
 
 unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
        [0 ... NR_CPUS-1] = NUMA_NO_NODE
@@ -62,13 +64,13 @@ static int __init populate_memnodemap(co
        unsigned long spdx, epdx;
        int i, res = -1;
 
-       memset(memnodemap, NUMA_NO_NODE, sizeof(memnodemap));
+       memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap));
        for (i = 0; i < numnodes; i++) {
                spdx = paddr_to_pdx(nodes[i].start);
-               epdx = paddr_to_pdx(nodes[i].end);
+               epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
                if (spdx >= epdx)
                        continue;
-               if ((epdx >> shift) >= NODEMAPSIZE)
+               if ((epdx >> shift) >= memnodemapsize)
                        return 0;
                do {
                        if (memnodemap[spdx >> shift] != NUMA_NO_NODE)
@@ -86,6 +88,28 @@ static int __init populate_memnodemap(co
        return res;
 }
 
+static int __init allocate_cachealigned_memnodemap(void)
+{
+       unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap));
+       unsigned long mfn = alloc_boot_pages(size, 1);
+
+       if (!mfn) {
+               printk(KERN_ERR
+                      "NUMA: Unable to allocate Memory to Node hash map\n");
+               memnodemapsize = 0;
+               return -1;
+       }
+
+       memnodemap = mfn_to_virt(mfn);
+       mfn <<= PAGE_SHIFT;
+       size <<= PAGE_SHIFT;
+       printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+              mfn, mfn + size);
+       memnodemapsize = size / sizeof(*memnodemap);
+
+       return 0;
+}
+
 /*
  * The LSB of all start and end addresses in the node map is the value of the
  * maximum possible shift.
@@ -99,7 +123,7 @@ static int __init extract_lsb_from_nodes
 
        for (i = 0; i < numnodes; i++) {
                spdx = paddr_to_pdx(nodes[i].start);
-               epdx = paddr_to_pdx(nodes[i].end);
+               epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
                if (spdx >= epdx)
                        continue;
                bitfield |= spdx;
@@ -108,9 +132,10 @@ static int __init extract_lsb_from_nodes
                        memtop = epdx;
        }
        if (nodes_used <= 1)
-               i = 63;
+               i = BITS_PER_LONG - 1;
        else
                i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+       memnodemapsize = (memtop >> i) + 1;
        return i;
 }
 
@@ -120,6 +145,10 @@ int __init compute_hash_shift(struct nod
        int shift;
 
        shift = extract_lsb_from_nodes(nodes, numnodes);
+       if (memnodemapsize <= ARRAY_SIZE(_memnodemap))
+               memnodemap = _memnodemap;
+       else if (allocate_cachealigned_memnodemap())
+               return -1;
        printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
                shift);
 
@@ -233,8 +262,8 @@ void __init numa_initmem_init(unsigned l
               (u64)start_pfn << PAGE_SHIFT,
               (u64)end_pfn << PAGE_SHIFT);
        /* setup dummy node covering all memory */ 
-       memnode_shift = 63; 
-       memnodemap[0] = 0;
+       memnode_shift = BITS_PER_LONG - 1;
+       memnodemap = _memnodemap;
        nodes_clear(node_online_map);
        node_set_online(0);
        for (i = 0; i < NR_CPUS; i++)
--- 2010-01-06.orig/xen/include/asm-x86/numa.h  2010-01-06 15:16:18.000000000 +0100
+++ 2010-01-06/xen/include/asm-x86/numa.h       2010-01-06 15:15:41.000000000 +0100
@@ -25,7 +25,6 @@ extern int pxm_to_node(int nid);
 
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 #define VIRTUAL_BUG_ON(x) 
-#define NODEMAPSIZE 0xfff
 
 extern void numa_add_cpu(int cpu);
 extern void numa_init_array(void);
@@ -51,7 +50,8 @@ static inline void clear_node_cpumask(in
 
 /* Simple perfect hash to map pdx to node numbers */
 extern int memnode_shift; 
-extern u8  memnodemap[NODEMAPSIZE]; 
+extern unsigned long memnodemapsize;
+extern u8 *memnodemap;
 
 struct node_data {
     unsigned long node_start_pfn;
@@ -64,7 +64,7 @@ extern struct node_data node_data[];
 static inline __attribute__((pure)) int phys_to_nid(paddr_t addr) 
 { 
        unsigned nid;
-       VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= NODEMAPSIZE);
+       VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= memnodemapsize);
        nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift]; 
        VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); 
        return nid; 
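
To make the sizing in the allocate_cachealigned_memnodemap() hunk above
concrete: a stand-alone model (not Xen code; PFN_UP is redefined locally
with its usual meaning) of how the map is allocated in whole pages and
memnodemapsize is then grown to cover the allocation, using PAGE_SHIFT
12 and the 4865-entry figure from the SRAT example earlier (again
assuming no pdx compression).

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_UP(x)  (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

int main(void)
{
    unsigned long entries = 4865;                /* (memtop >> shift) + 1 */
    unsigned long pages   = PFN_UP(entries * sizeof(unsigned char));
    unsigned long usable  = (pages << PAGE_SHIFT) / sizeof(unsigned char);

    printf("%lu entries -> %lu page(s) -> %lu usable entries\n",
           entries, pages, usable);
    /* prints: 4865 entries -> 2 page(s) -> 8192 usable entries */
    return 0;
}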


Attachment: xen-x86-numa-node-map.patch
Description: Text document
