[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 19/21] xen/arm: Balance Dom0 memory allocation across allowed NUMA nodes


  • To: xen-devel@xxxxxxxxxxxxxxxxxxxx
  • From: Hirokazu Takahashi <taka@xxxxxxxxxxxxx>
  • Date: Sun, 24 May 2026 09:02:07 +0900
  • Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=valinux.co.jp; dmarc=pass action=none header.from=valinux.co.jp; dkim=pass header.d=valinux.co.jp; arc=none
  • Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector10001; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-AntiSpam-MessageData-ChunkCount:X-MS-Exchange-AntiSpam-MessageData-0:X-MS-Exchange-AntiSpam-MessageData-1; bh=JFwOGfPyzWG9F7U0qyfrx8XENPksC2lqRTWEjzBLvZU=; b=kwOPPbsOkm/Ym8+lBSwC/G52Fk4zYAxLa8TQYhx37D0vCymYWlMPkquAcugVmeHP8PQBV55KgTnqE1DllSxXNtX3DZlXobiWLG1gS170SX0lKLtT6JvFCLTJmio40i6AtIuCTZ1w1ro3VshMvG96ltxdn7c1dUGbvwNmqGzE+0ZaQOnpzjd4s7GeYHJOdY3qVOd/EKkGcci2yjHuwKbnMBSbgsZnbzVm6Yo7G74K3/QfwEl6CITn7jmAZm6gYNkDfxiqS/cm13SoaOTLc9l0ksOb3OwcmYwk3BXf3S3DuZlLzVTdIhfI9o5CqeKdtTnlaOGQgbJ0HWE6XSNwp2iqJQ==
  • Arc-seal: i=1; a=rsa-sha256; s=arcselector10001; d=microsoft.com; cv=none; b=bbMPEwJbRaFnrcaQgbQ31+GBmkN1wqmv1jN0WiLSbxZPGxK1gp8UiazRoQMkIp8i/+s/AHAZiWuAQbx0AVfiE9bMCXZeG8GJOCwCE4mC/Bh5fSeU0GW35rCtC+ehRDXOOuNPvgTYNZI5wElzYnRMNpQVfjgejI/44xisxTjFm461xE+sw2vqvEa4mPmLCEsAILwxRaMrbI47U7kwoWUPWt4wgiiVV2A50QrDSXP7cDQIveRbNXnmle3r/xiIGomolfIH4OwzVUxkgymEPPB5B6+bXntUKdJLBHKEup8c4a5GKXKLeV42TDWZsc7gxqQf5JKqjv0uxQJN0jp2+FXBPg==
  • Authentication-results: eu.smtp.expurgate.cloud; dkim=pass header.s=selector1 header.d=valinux.co.jp header.i="@valinux.co.jp" header.h="From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck"
  • Authentication-results: dkim=none (message not signed) header.d=none;dmarc=none action=none header.from=valinux.co.jp;
  • Cc: andrew.cooper3@xxxxxxxxxx, anthony.perard@xxxxxxxxxx, michal.orzel@xxxxxxx, jbeulich@xxxxxxxx, julien@xxxxxxx, roger.pau@xxxxxxxxxx, sstabellini@xxxxxxxxxx, jgross@xxxxxxxx, bertrand.marquis@xxxxxxx, Volodymyr_Babchuk@xxxxxxxx, dfaggioli@xxxxxxxx, gwd@xxxxxxxxxxxxxx, Hirokazu Takahashi <taka@xxxxxxxxxxxxx>
  • Delivery-date: Sun, 24 May 2026 00:03:16 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

Allocate memory for Domain-0 exclusively from the permitted NUMA nodes.
When multiple NUMA nodes are available, distribute the allocation in a
balanced manner across each of these nodes.
---
 xen/arch/arm/domain_build.c | 275 ++++++++++++++++++++----------------
 1 file changed, 150 insertions(+), 125 deletions(-)

diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
index 2bf4b37f89..7960dcd33a 100644
--- a/xen/arch/arm/domain_build.c
+++ b/xen/arch/arm/domain_build.c
@@ -180,33 +180,19 @@ unsigned int __init dom0_max_vcpus(void)
 static bool __init insert_11_bank(struct domain *d,
                                   struct kernel_info *kinfo,
                                   struct page_info *pg,
-                                  unsigned int order)
+                                  unsigned int order,
+                                  nodeid_t node)
 {
     struct membanks *mem = kernel_info_get_mem(kinfo);
     unsigned int i;
     int res;
     mfn_t smfn;
     paddr_t start, size;
-    nodeid_t node = 0U;
 
     smfn = page_to_mfn(pg);
     start = mfn_to_maddr(smfn);
     size = pfn_to_paddr(1UL << order);
 
-    /* This code is temporal */
-    {
-        struct membanks *mem = bootinfo_get_mem();
-        for ( i = 0; i < mem->nr_banks; i++ )
-        {
-            if ( start >= mem->bank[i].start &&
-                 start < (mem->bank[i].start + mem->bank[i].size) )
-            {
-                node = get_numa_nodeid(&mem->bank[i]);
-                break;
-            }
-        }
-    }
-
     D11PRINT("Allocated %#"PRIpaddr"-%#"PRIpaddr" (%ldMB/%ldMB, order %d)\n",
              start, start + size,
              1UL << (order + PAGE_SHIFT - 20),
@@ -293,7 +279,13 @@ fail:
 }
 
 /*
- * This is all pretty horrible.
+ * Allocate NUMA-aware memory for Dom0 with 1:1 mapping.
+ *
+ * This function distributes the requested Dom0 memory across the allowed
+ * physical NUMA nodes in a balanced manner. It implements a multi-pass
+ * scavenging loop to allow nodes to dynamically back up each other if a
+ * particular node runs out of memory, maintaining a balanced distribution
+ * while ensuring the maximum amount of requested memory is satisfied.
  *
  * Requirements:
  *
@@ -308,155 +300,187 @@ fail:
  *    below 4GB, so that it can be used by non-LPAE enabled kernels (32-bit).
  * 4. Some devices assigned to dom0 can only do 32-bit DMA access or
  *    even be more restricted. We want to allocate as much of the RAM
- *    as we reasonably can that can be accessed from all the devices..
+ *    as we reasonably can that can be accessed from all the devices.
  * 5. For 32-bit dom0 the kernel must be located below 4GB.
- * 6. We want to have a few largers banks rather than many smaller ones.
+ * 6. We want to have a few larger banks rather than many smaller ones.
  *
  * For the first two requirements we need to make sure that the lowest
- * bank is sufficiently large.
- *
- * For convenience we also sort the banks by physical address.
- *
- * The memory allocator does not really give us the flexibility to
- * meet these requirements directly. So instead of proceed as follows:
- *
- * We first allocate the largest allocation we can as low as we
- * can. This then becomes the first bank. This bank must be at least
- * 128MB (or memory size requested for domain if that is smaller).
+ * bank (Bank 0) is sufficiently large to hold all boot modules.
  *
- * Then we start allocating more memory, trying to allocate the
- * largest possible size and trying smaller sizes until we
- * successfully allocate something.
+ * The memory allocator does not really give us the flexibility to meet
+ * these requirements directly under NUMA topologies. So instead we proceed
+ * as follows:
  *
- * We then try and insert this memory in to the list of banks. If it
- * can be merged into an existing bank then this is trivial.
+ * We first calculate the total size required for the kernel, ramdisk, and
+ * DTB to establish a safe minimum size constraint for the first bank (Bank 0).
  *
- * If the new memory is before the first bank (and cannot be merged into it)
- * and is at least 128M then we allow it, otherwise we give up. Since the
- * allocator prefers to allocate high addresses first and the first bank has
- * already been allocated to be as low as possible this likely means we
- * wouldn't have been able to allocate much more memory anyway.
+ * We then enter a multi-pass outer loop that runs until the full memory
+ * request is met. In each pass, we dynamically calculate the target allocation
+ * amount for each remaining active node to ensure a balanced distribution.
  *
- * Otherwise we insert a new bank. If we've reached MAX_NR_BANKS then
- * we give up.
- *
- * For 32-bit domain we require that the initial allocation for the
- * first bank is part of the low mem. For 64-bit, the first bank is preferred
- * to be allocated in the low mem. Then for subsequent allocation, we
- * initially allocate memory only from low mem. Once that runs out out
- * (as described above) we allow higher allocations and continue until
- * that runs out (or we have allocated sufficient dom0 memory).
+ * For the initial chunk (Bank 0), we try to allocate the largest possible size
+ * as low as possible, honoring the 32-bit lowmem/DMA constraints. If it fails
+ * to find lowmem space and the domain is 64-bit, it falls back to highmem
+ * without violating the minimum size needed for the boot modules.
  */
 static void __init allocate_memory_11(struct domain *d,
                                       struct kernel_info *kinfo)
 {
-    const unsigned int min_low_order =
-        get_order_from_bytes(min_t(paddr_t, kinfo->unassigned_mem, MB(128)));
-    const unsigned int min_order = get_order_from_bytes(MB(4));
+    paddr_t todo = kinfo->unassigned_mem;
+    nodeid_t node;
+
+    unsigned int max_chunk_order = get_order_from_bytes(MB(128));
+    unsigned int min_bank0_order;
+    unsigned int lowmem_bitsize = arch_get_dma_bitsize();
+    bool is_bank0 = true;
+
+    struct boot_module *kernel_mod  = boot_module_find_by_kind(BOOTMOD_KERNEL);
+    struct boot_module *ramdisk_mod = boot_module_find_by_kind(BOOTMOD_RAMDISK);
+    struct boot_module *dtb_mod     = boot_module_find_by_kind(BOOTMOD_FDT);
+    paddr_t required_size = 0;
+
+    nodemask_t exhausted_nodes;
+    nodemask_t valid_nodes;
+
     struct membanks *mem = kernel_info_get_mem(kinfo);
-    struct page_info *pg;
-    unsigned int order = get_allocation_size(kinfo->unassigned_mem);
     unsigned int i;
 
-    bool lowmem = true;
-    unsigned int lowmem_bitsize = min(32U, arch_get_dma_bitsize());
-    unsigned int bits;
-
     /*
      * TODO: Implement memory bank allocation when DOM0 is not direct
      * mapped
      */
     BUG_ON(!is_domain_direct_mapped(d));
 
-    printk("Allocating 1:1 mappings totalling %ldMB for %pd:\n",
+    printk("Allocating 1:1 mappings totalling %ldMB for dom0:\n",
            /* Don't want format this as PRIpaddr (16 digit hex) */
-           (unsigned long)(kinfo->unassigned_mem >> 20), d);
+           (unsigned long)(kinfo->unassigned_mem >> 20));
 
     mem->nr_banks = 0;
 
     /*
-     * First try and allocate the largest thing we can as low as
-     * possible to be bank 0.
+     * Calculate the absolute minimum size required to fit the kernel,
+     * initrd, and DTB inside Bank 0
      */
-    while ( order >= min_low_order )
-    {
-        for ( bits = order ; bits <= lowmem_bitsize; bits++ )
-        {
-            pg = alloc_domheap_pages(d, order, MEMF_bits(bits));
-            if ( pg != NULL )
-            {
-                if ( !insert_11_bank(d, kinfo, pg, order) )
-                    BUG(); /* Cannot fail for first bank */
+    if ( kernel_mod )
+        required_size += kernel_mod->size;
+    if ( ramdisk_mod )
+        required_size += ramdisk_mod->size;
+    if ( dtb_mod )
+        required_size += dtb_mod->size;
 
-                goto got_bank0;
-            }
-        }
-        order--;
-    }
-
-    /* Failed to allocate bank0 in the lowmem region. */
-    if ( is_32bit_domain(d) )
-        panic("Unable to allocate first memory bank\n");
+    min_bank0_order = get_order_from_bytes(required_size);
 
-    /* Try to allocate memory from above the lowmem region */
-    printk(XENLOG_INFO "No bank has been allocated below %u-bit.\n",
-           lowmem_bitsize);
-    lowmem = false;
+    nodes_clear(exhausted_nodes);
+    nodes_and(valid_nodes, d->node_affinity, node_online_map);
 
- got_bank0:
+    BUG_ON(nodes_empty(valid_nodes));
 
-    /*
-     * If we failed to allocate bank0 in the lowmem region,
-     * continue allocating from above the lowmem and fill in banks.
-     */
-    order = get_allocation_size(kinfo->unassigned_mem);
-    while ( kinfo->unassigned_mem && mem->nr_banks < mem->max_banks )
+    while ( todo > 0 )
     {
-        pg = alloc_domheap_pages(d, order,
-                                 lowmem ? MEMF_bits(lowmem_bitsize) : 0);
-        if ( !pg )
-        {
-            order --;
+        paddr_t last_todo = todo;
+        nodemask_t active_nodes;
+        unsigned int active_nodes_count;
+        unsigned int nodes_left;
 
-            if ( lowmem && order < min_low_order)
-            {
-                D11PRINT("Failed at min_low_order, allow high allocations\n");
-                order = get_allocation_size(kinfo->unassigned_mem);
-                lowmem = false;
-                continue;
-            }
-            if ( order >= min_order )
-                continue;
+        /* Filter out exhausted nodes to find active candidates */
+        nodes_andnot(active_nodes, valid_nodes, exhausted_nodes);
+        active_nodes_count = nodes_weight(active_nodes);
 
-            /* No more we can do */
+        if ( active_nodes_count == 0 )
+        {
+            printk(XENLOG_WARNING "Dom0 NUMA: All specified nodes are completely exhausted.\n");
             break;
         }
 
-        if ( !insert_11_bank(d, kinfo, pg, order) )
+        nodes_left = active_nodes_count;
+
+        for_each_node_mask(node, active_nodes)
         {
-            if ( mem->nr_banks == mem->max_banks )
-                /* Nothing more we can do. */
-                break;
+            paddr_t target_per_node;
+            paddr_t node_todo;
 
-            if ( lowmem )
-            {
-                D11PRINT("Allocation below bank 0, allow high allocations\n");
-                order = get_allocation_size(kinfo->unassigned_mem);
-                lowmem = false;
-                continue;
-            }
-            else
+            /* Target chunk size per node */
+            target_per_node = DIV_ROUND_UP(todo, nodes_left);
+            target_per_node = DIV_ROUND_UP(target_per_node, MB(128)) * MB(128);
+
+            node_todo = min(todo, target_per_node);
+
+            while ( node_todo > 0 )
             {
-                D11PRINT("Allocation below bank 0\n");
-                break;
+                struct page_info *pg = NULL;
+                unsigned int max_order = get_allocation_size(node_todo);
+                unsigned int order;
+                paddr_t bank_size;
+
+                /*
+                 * Enforce a maximum chunk cap of 128MB for all allocations
+                 * except Bank 0
+                 */
+                if ( !is_bank0 && max_order > max_chunk_order )
+                    max_order = max_chunk_order;
+
+                for ( order = max_order; ; order-- )
+                {
+                    unsigned int memflags = MEMF_node(node);
+                    if ( !dom0_affinity_relaxed )
+                        memflags |= MEMF_exact_node;
+
+                    if ( is_bank0 )
+                    {
+                        unsigned int bits;
+                        for ( bits = order; bits <= lowmem_bitsize; bits++ )
+                        {
+                            pg = alloc_domheap_pages(d, order, memflags | MEMF_bits(bits));
+                            if ( pg != NULL )
+                                break;
+                        }
+
+                        if ( !pg && order <= min_bank0_order )
+                        {
+                            if ( is_32bit_domain(d) )
+                                panic("Unable to allocate first memory bank below %u-bit\n", lowmem_bitsize);
+
+                            pg = alloc_domheap_pages(d, order, memflags);
+                        }
+                    }
+                    else
+                    {
+                        pg = alloc_domheap_pages(d, order, memflags);
+                    }
+
+                    if ( pg || order == 0 )
+                        break;
+                }
+
+                if ( !pg )
+                {
+                    node_set(node, exhausted_nodes);
+                    break;
+                }
+
+                if ( is_bank0 )
+                    is_bank0 = false;
+
+                if ( !insert_11_bank(d, kinfo, pg, order, node) )
+                    break;
+
+                bank_size = 1ULL << (PAGE_SHIFT + order);
+                node_todo -= bank_size;
+                todo -= bank_size;
+
+                if ( todo == 0 ) break;
             }
+
+            nodes_left--;
+            if ( todo == 0 ) break;
         }
 
         /*
-         * Success, next time around try again to get the largest order
-         * allocation possible.
+         * Prevent infinite loop if a full pass across all active nodes
+         * yields zero progress
          */
-        order = get_allocation_size(kinfo->unassigned_mem);
+        if ( todo == last_todo )
+            break;
     }
 
     if ( kinfo->unassigned_mem )
@@ -464,14 +488,15 @@ static void __init allocate_memory_11(struct domain *d,
         panic("Failed to allocate requested dom0 memory. %ldMB unallocated\n",
               (unsigned long)kinfo->unassigned_mem >> 20);
 
-    for( i = 0; i < mem->nr_banks; i++ )
+    for ( i = 0; i < mem->nr_banks; i++ )
     {
-        printk("BANK[%d] %#"PRIpaddr"-%#"PRIpaddr" (%ldMB)\n",
+        printk("BANK[%d] %#"PRIpaddr"-%#"PRIpaddr" (%ldMB) NODE:%u\n",
                i,
                mem->bank[i].start,
                mem->bank[i].start + mem->bank[i].size,
                /* Don't want format this as PRIpaddr (16 digit hex) */
-               (unsigned long)(mem->bank[i].size >> 20));
+               (unsigned long)(mem->bank[i].size >> 20),
+               get_numa_nodeid(&mem->bank[i]));
     }
 }
 
-- 
2.43.0




 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.