
[Xen-devel] [PATCH v3] Xen: Spread boot time page scrubbing across all available CPUs



From: Malcolm Crossley <malcolm.crossley@xxxxxxxxxx>

The page scrubbing is done in 128MB chunks in lockstep across all the CPUs.
This allows the boot CPU to hold the heap_lock whilst each chunk is being
scrubbed, and to release the heap_lock once all CPUs have finished scrubbing
their individual chunk. As a result the heap_lock is not held continuously,
and pending softirqs can be serviced periodically on all CPUs.
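
Roughly, the lockstep loop driven by the boot CPU looks like the following
(an illustrative sketch of the scheme, not the patch hunks themselves;
worker_cpus and per_cpu_sz stand in for the per-node CPU masks and sizes the
patch computes):

    /* Sketch: walk each CPU's share of the node in chunk_size steps. */
    for ( offset = 0; offset < per_cpu_sz; offset += chunk_size )
    {
        /* Keep timers, RCU callbacks, etc. alive between chunks. */
        process_pending_softirqs();

        /* Hold the heap lock only while this one chunk is scrubbed. */
        spin_lock(&heap_lock);
        on_selected_cpus(&worker_cpus, smp_scrub_heap_pages, NULL, 1);
        spin_unlock(&heap_lock);
    }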

The page scrub memory chunks are allocated to the CPUs in a NUMA-aware
fashion to reduce socket interconnect overhead and improve performance.
Specifically, in the first phase we scrub at the same time on all the
NUMA nodes that have CPUs - we also weed out the SMT threads so that
we only use cores (that gives a 50% boost). In the second phase we use
all of the CPUs for the NUMA nodes that have no CPUs.
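
The SMT weeding amounts to keeping only the first hardware thread of each
core in the per-node worker mask, roughly as sketched below (simplified from
the patch; node, node_cpus and temp_cpus stand in for the per-node variables
it uses):

    /* Sketch: start from the online CPUs on this NUMA node ... */
    cpumask_and(&node_cpus, &node_to_cpumask(node), &cpu_online_map);
    cpumask_copy(&temp_cpus, &node_cpus);
    /* ... and drop every sibling thread except the first one per core. */
    for_each_cpu ( j, &temp_cpus )
    {
        unsigned int nth = 0, sibling;

        for_each_cpu ( sibling, per_cpu(cpu_sibling_mask, j) )
        {
            if ( nth++ == 0 )   /* skip the 1st thread - "the core" */
                continue;
            cpumask_clear_cpu(sibling, &node_cpus);
        }
    }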

This patch reduces the boot page scrub time on a 128GB 64 core AMD Opteron
6386 machine from 49 seconds to 3 seconds.
On an IvyBridge-EX 8 socket box with 1.5TB it cuts it down from 15 minutes
to 117 seconds.
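
For reference, the chunk size can be tuned at boot via the new command line
parameter added below, e.g. something along the lines of (size suffix as
understood by size_param()):

    bootscrub_chunk=64M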

v2:
 - Reduced default chunk size to 128MB
 - Added code to scrub NUMA nodes with no active CPU linked to them
 - Be robust to boot CPU not being linked to a NUMA node

v3:
 - Don't use SMT threads
 - Take care of remainder if the number of CPUs (or memory) is odd
 - Restructure the worker thread
 - s/u64/unsigned long/

Signed-off-by: Malcolm Crossley <malcolm.crossley@xxxxxxxxxx>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
---
 docs/misc/xen-command-line.markdown |   10 ++
 xen/common/page_alloc.c             |  177 +++++++++++++++++++++++++++++++----
 2 files changed, 167 insertions(+), 20 deletions(-)

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index 87de2dc..a7da227 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -197,6 +197,16 @@ Scrub free RAM during boot.  This is a safety feature to prevent
 accidentally leaking sensitive VM data into other VMs if Xen crashes
 and reboots.
 
+### bootscrub_chunk
+> `= <size>`
+
+> Default: `128MiB`
+
+Maximum RAM block size to be scrubbed whilst holding the page heap lock and
+not running softirqs. Reduce this if softirqs are not being run frequently
+enough. Setting this to a high value may cause boot failure, particularly if
+the NMI watchdog is also enabled.
+
 ### cachesize
 > `= <size>`
 
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 601319c..3ad6e1d 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -65,6 +65,12 @@ static bool_t opt_bootscrub __initdata = 1;
 boolean_param("bootscrub", opt_bootscrub);
 
 /*
+ * opt_bootscrub_chunk -> Size (in bytes) of the memory block scrubbed with the heap lock held.
+ */
+static unsigned int __initdata opt_bootscrub_chunk = 128 * 1024 * 1024;
+size_param("bootscrub_chunk", opt_bootscrub_chunk);
+
+/*
  * Bit width of the DMA heap -- used to override NUMA-node-first.
  * allocation strategy, which can otherwise exhaust low memory.
  */
@@ -90,6 +96,16 @@ static struct bootmem_region {
 } *__initdata bootmem_region_list;
 static unsigned int __initdata nr_bootmem_regions;
 
+struct scrub_region {
+    unsigned long offset;
+    unsigned long start;
+    unsigned long per_cpu_sz;
+    unsigned long rem;
+    cpumask_t cpu;
+};
+static struct scrub_region __initdata region[MAX_NUMNODES];
+static unsigned long __initdata chunk_size;
+
 static void __init boot_bug(int line)
 {
     panic("Boot BUG at %s:%d", __FILE__, line);
@@ -1256,45 +1272,166 @@ void __init end_boot_allocator(void)
     printk("\n");
 }
 
+void __init smp_scrub_heap_pages(void *data)
+{
+    unsigned long mfn, start, end;
+    struct page_info *pg;
+    struct scrub_region *r;
+    unsigned int temp_cpu, node, cpu_idx = 0;
+    unsigned int cpu = smp_processor_id();
+
+    if ( data )
+        r = data;
+    else {
+        node = cpu_to_node(cpu);
+        if ( node == NUMA_NO_NODE )
+            return;
+        r = &region[node];
+    }
+    ASSERT(r != NULL);
+
+    /* Determine the current CPU's index among the CPUs linked to this node. */
+    for_each_cpu ( temp_cpu, &r->cpu )
+    {
+        if ( cpu == temp_cpu )
+            break;
+        cpu_idx++;
+    }
+
+    /* Calculate the starting mfn for this CPU's memory block */
+    start = r->start + (r->per_cpu_sz * cpu_idx) + r->offset;
+
+    /* Calculate the end mfn into this CPU's memory block for this iteration */
+    if ( r->offset + chunk_size > r->per_cpu_sz ) {
+        end = r->start + (r->per_cpu_sz * cpu_idx) + r->per_cpu_sz;
+        if ( r->rem && (cpumask_weight(&r->cpu) - 1 == cpu_idx) )
+            end += r->rem;
+    }
+    else
+        end = start + chunk_size;
+
+    for ( mfn = start; mfn < end; mfn++ )
+    {
+        pg = mfn_to_page(mfn);
+
+        /* Check the mfn is valid and page is free. */
+        if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
+            continue;
+
+        scrub_one_page(pg);
+    }
+    wmb();
+}
+
 /*
- * Scrub all unallocated pages in all heap zones. This function is more
- * convoluted than appears necessary because we do not want to continuously
- * hold the lock while scrubbing very large memory areas.
+ * Scrub all unallocated pages in all heap zones. This function uses all
+ * online CPUs to scrub the memory in parallel.
  */
 void __init scrub_heap_pages(void)
 {
-    unsigned long mfn;
-    struct page_info *pg;
+    cpumask_t node_cpus, temp_cpus, all_worker_cpus = {{ 0 }};
+    unsigned int i, j, cpu, sibling;
+    unsigned long offset, max_per_cpu_sz = 0;
+    unsigned long start, end;
+    unsigned long rem = 0;
 
     if ( !opt_bootscrub )
         return;
 
-    printk("Scrubbing Free RAM: ");
+    /* Scrub chunk size, in pages. */
+    chunk_size = opt_bootscrub_chunk >> PAGE_SHIFT;
+    if ( chunk_size == 0 )
+        chunk_size = 1;
 
-    for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
+    /* Round #0 - figure out amounts and which CPUs to use */
+    for_each_online_node ( i )
     {
+        /* Calculate Node memory start and end address */
+        start = max(node_start_pfn(i), first_valid_mfn);
+        end = min(node_start_pfn(i) + node_spanned_pages(i), max_page);
+        /* CPUs that are online and on this node (if none, that is OK). */
+        cpumask_and(&node_cpus, &node_to_cpumask(i), &cpu_online_map);
+        cpumask_copy(&temp_cpus, &node_cpus);
+        /* Rip out threads. */
+        for_each_cpu ( j, &temp_cpus )
+        {
+            cpu = 0;
+            for_each_cpu(sibling, per_cpu(cpu_sibling_mask, j)) {
+                if (cpu++ == 0) /* Skip 1st CPU - the core */
+                    continue;
+                cpumask_clear_cpu(sibling, &node_cpus);
+            }
+        }
+        cpumask_or(&all_worker_cpus, &all_worker_cpus, &node_cpus);
+        if ( cpumask_empty(&node_cpus) ) { /* No CPUs on this node. */
+            rem = 0;
+            region[i].per_cpu_sz = (end - start);
+        } else {
+            rem = (end - start) % cpumask_weight(&node_cpus);
+            region[i].per_cpu_sz = (end - start) / cpumask_weight(&node_cpus);
+            if ( region[i].per_cpu_sz > max_per_cpu_sz )
+                max_per_cpu_sz = region[i].per_cpu_sz;
+        }
+        region[i].start = start;
+        region[i].rem = rem;
+        cpumask_copy(&region[i].cpu, &node_cpus);
+
+    }
+    cpu = smp_processor_id();
+    /* Round default chunk size down if required */
+    if ( max_per_cpu_sz && chunk_size > max_per_cpu_sz )
+        chunk_size = max_per_cpu_sz;
+
+    printk("Scrubbing Free RAM on %u nodes using %u CPUs: ", 
num_online_nodes(),
+           cpumask_weight(&all_worker_cpus));
+
+    /* Round: #1 - do NUMA nodes with CPUs */
+    for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size )
+    {
+        for_each_online_node ( i )
+            region[i].offset = offset;
+
         process_pending_softirqs();
 
-        pg = mfn_to_page(mfn);
+        spin_lock(&heap_lock);
+        on_selected_cpus(&all_worker_cpus, smp_scrub_heap_pages, NULL, 1);
+        spin_unlock(&heap_lock);
 
-        /* Quick lock-free check. */
-        if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
+        printk(".");
+    }
+
+    /* Round #2: NUMA nodes with no CPUs get scrubbed with all CPUs. */
+    for_each_online_node ( i )
+    {
+        node_cpus = node_to_cpumask(i);
+
+        if ( !cpumask_empty(&node_cpus) )
             continue;
 
-        /* Every 100MB, print a progress dot. */
-        if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
-            printk(".");
+        /* We already have the node information from round #0 */
+        end = region[i].start + region[i].per_cpu_sz;
+        rem = region[i].per_cpu_sz % cpumask_weight(&all_worker_cpus);
 
-        spin_lock(&heap_lock);
+        region[i].rem = rem;
+        region[i].per_cpu_sz /= cpumask_weight(&all_worker_cpus);
+        max_per_cpu_sz = region[i].per_cpu_sz;
+        if ( max_per_cpu_sz && chunk_size > max_per_cpu_sz )
+            chunk_size = max_per_cpu_sz;
+        cpumask_copy(&region[i].cpu, &all_worker_cpus);
 
-        /* Re-check page status with lock held. */
-        if ( page_state_is(pg, free) )
-            scrub_one_page(pg);
+        for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size )
+        {
+            region[i].offset = offset;
 
-        spin_unlock(&heap_lock);
-    }
+            process_pending_softirqs();
+
+            spin_lock(&heap_lock);
+            on_selected_cpus(&all_worker_cpus, smp_scrub_heap_pages, &region[i], 1);
+            spin_unlock(&heap_lock);
 
-    printk("done.\n");
+            printk(".");
+        }
+    }
 
     /* Now that the heap is initialized, run checks and set bounds
      * for the low mem virq algorithm. */
-- 
1.7.7.6

