Xen project Mailing List

Re: [PATCH v3 5/6] xen/x86: move NUMA scan nodes codes from x86 to common

Date: Thu, 25 Aug 2022 14:50:14 +0200

Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=suse.com; dmarc=pass action=none header.from=suse.com; dkim=pass header.d=suse.com; arc=none

Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-AntiSpam-MessageData-ChunkCount:X-MS-Exchange-AntiSpam-MessageData-0:X-MS-Exchange-AntiSpam-MessageData-1; bh=zfcUUSmjuBQL2clpKn7sZv66CIzJz2MxVSwK4vScXJs=; b=RT7PO3d6kLotsKU1yNiUP/O4y9maf47mhu+mkVFR45/BQOAagQR0fuDUfTLMDpDXt6Y33VLeaXLvLH/aAVJO2oh0TZsUdEJAPvvX7iMQBGCnbvK3D0ZWRZHsShXM/B4KshkIsIvwus08vsw08ERPjAGYcBBmzmku3IASutVcKDYrV+Y6ySl0SlN0vFOuF8F9htHenOP6a4e0K+1EjbeZlZtLB4Fp4eAkthJwLyEs17xBp0Pt1rf2niqwO4Esw2mt8DS+s2vGblzdYcoZIttFutJ+Q+X10KNR3P9MQBtvy0TkuecouS+WRzSQacDqd5kzALzm/g18PNQWqW0Lw7SVDA==

Arc-seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=jeG4ufI1MJuAiKqhK60mWctCCis9JRc8VIM1x2JmxyJDGgS2KKy3qVFM1bbvnE0PK+13Bliw4jg/VK/Gok1sGyT7s+TDudFMV/Q9cn9A58CbwgSnbKwJwtCEJp2hRAdfx5XT6gtQbvofgVlEByUnOhTeMg32pspKmAaqHuI+GdsobuJTerGh1LqZi6LMMl8ZOSY0mLcoO+IJ2AXd3hBuf63qeQZKzSAzQtfataNWC/EK34e4ow4erqaTQLPJHpZH0RNbocsM5lWc931v0c+KunWNhdbYkPnDaNni1fSprOYOZvRNTmNtMtwWv//eNOlBtxG4nl6WiB5dvuSBGMOPKg==

Authentication-results: dkim=none (message not signed) header.d=none;dmarc=none action=none header.from=suse.com;

Cc: nd@xxxxxxx, Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Roger Pau Monné <roger.pau@xxxxxxxxxx>, Wei Liu <wl@xxxxxxx>, George Dunlap <george.dunlap@xxxxxxxxxx>, Julien Grall <julien@xxxxxxx>, Stefano Stabellini <sstabellini@xxxxxxxxxx>, xen-devel@xxxxxxxxxxxxxxxxxxxx

Delivery-date: Thu, 25 Aug 2022 12:50:41 +0000

List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

On 22.08.2022 04:58, Wei Chen wrote: > --- a/xen/common/numa.c > +++ b/xen/common/numa.c > @@ -13,6 +13,21 @@ > #include <xen/sched.h> > #include <xen/softirq.h> > > +static nodemask_t __initdata processor_nodes_parsed; > +static nodemask_t __initdata memory_nodes_parsed; > +static struct node __initdata nodes[MAX_NUMNODES]; > + > +static int __ro_after_init num_node_memblks; unsigned int? > @@ -36,6 +51,308 @@ bool numa_disabled(void) > return numa_off || arch_numa_disabled(false); > } > > +void __init numa_set_processor_nodes_parsed(nodeid_t node) > +{ > + node_set(node, processor_nodes_parsed); > +} > + > +unsigned int valid_numa_range(paddr_t start, paddr_t end, nodeid_t node) bool (and then true/false below)? > +{ > + unsigned int i; > + > + for ( i = 0; i < num_node_memblks; i++ ) > + { > + struct node *nd = &node_memblk_range[i]; const? > + if ( nd->start <= start && nd->end >= end && > + memblk_nodeid[i] == node ) > + return 1; > + } > + > + return 0; > +} > + > +static > +enum conflicts __init conflicting_memblks(nodeid_t nid, paddr_t start, May I ask that you re-flow this to either static enum conflicts __init conflicting_memblks(nodeid_t nid, paddr_t start, or static enum conflicts __init conflicting_memblks( nodeid_t nid, paddr_t start, ? > + paddr_t end, paddr_t nd_start, > + paddr_t nd_end, unsigned int > *mblkid) > +{ > + unsigned int i; > + > + /* > + * Scan all recorded nodes' memory blocks to check conflicts: > + * Overlap or interleave. > + */ > + for ( i = 0; i < num_node_memblks; i++ ) > + { > + struct node *nd = &node_memblk_range[i]; const? > +bool __init numa_memblks_available(void) > +{ > + return num_node_memblks < NR_NODE_MEMBLKS; > +} This is kind of clumsy, but I have no better suggestion. > +/* > + * This function will be called by NUMA memory affinity initialization to > + * update NUMA node's memory range. In this function, we assume all memory > + * regions belonging to a single node are in one chunk. Holes (or MMIO > + * ranges) between them will be included in the node. > + * > + * So in numa_update_node_memblks, if there are multiple banks for each > + * node, start and end are stretched to cover the holes between them, and > + * it works as long as memory banks of different NUMA nodes don't interleave. > + */ > +int __init numa_update_node_memblks(nodeid_t node, unsigned int arch_nid, The function only ever returns 0 or -EINVAL - please consider switching to "bool" return type. > + paddr_t start, paddr_t size, > + const char *prefix, > + bool hotplug) > +{ > + unsigned int i; > + paddr_t end = start + size; > + paddr_t nd_start = start; > + paddr_t nd_end = end; > + struct node *nd = &nodes[node]; > + > + /* > + * For the node that already has some memory blocks, we will > + * expand the node memory range temporarily to check memory > + * interleaves with other nodes. We will not use this node > + * temp memory range to check overlaps, because it will mask > + * the overlaps in same node. > + * > + * Node with 0 bytes memory doesn't need this expandsion. > + */ > + if ( nd->start != nd->end ) > + { > + if ( nd_start > nd->start ) > + nd_start = nd->start; > + > + if ( nd_end < nd->end ) > + nd_end = nd->end; > + } > + > + /* It is fine to add this area to the nodes data it will be used later*/ Please adjust style here. > + switch ( conflicting_memblks(node, start, end, nd_start, nd_end, &i) ) > + { > + case OVERLAP: > + if ( memblk_nodeid[i] == node ) > + { > + bool mismatch = !(hotplug) != !test_bit(i, memblk_hotplug); > + > + printk("%sNUMA: %s %u [%"PRIpaddr", %"PRIpaddr"] overlaps with > itself [%"PRIpaddr", %"PRIpaddr"]\n", > + mismatch ? KERN_ERR : KERN_WARNING, prefix, > + arch_nid, start, end - 1, > + node_memblk_range[i].start, node_memblk_range[i].end - 1); > + if ( mismatch ) > + return -EINVAL; > + break; > + } > + > + printk(KERN_ERR > + "NUMA: %s %u [%"PRIpaddr", %"PRIpaddr"] overlaps with %s %u > [%"PRIpaddr", %"PRIpaddr"]\n", > + prefix, arch_nid, start, end - 1, prefix, > + numa_node_to_arch_nid(memblk_nodeid[i]), > + node_memblk_range[i].start, node_memblk_range[i].end - 1); > + return -EINVAL; > + > + > + case INTERLEAVE: > + printk(KERN_ERR > + "NUMA： %s %u: [%"PRIpaddr", %"PRIpaddr"] interleaves with %s > %u memblk [%"PRIpaddr", %"PRIpaddr"]\n", > + prefix, arch_nid, nd_start, nd_end - 1, > + prefix, numa_node_to_arch_nid(memblk_nodeid[i]), > + node_memblk_range[i].start, node_memblk_range[i].end - 1); > + return -EINVAL; > + > + case NO_CONFLICT: > + break; > + } > + > + if ( !hotplug ) > + { > + node_set(node, memory_nodes_parsed); > + nd->start = nd_start; > + nd->end = nd_end; > + } > + > + if ( strcasecmp("Node", prefix) ) > + printk(KERN_INFO "NUMA: Node %u %s %u [%"PRIpaddr", > %"PRIpaddr"]%s\n", > + node, prefix, arch_nid, start, end - 1, > + hotplug ? " (hotplug)" : ""); > + else > + printk(KERN_INFO "NUMA: Node %u [%"PRIpaddr", %"PRIpaddr"]%s\n", > + node, start, end - 1, hotplug ? " (hotplug)" : ""); Hmm, if I'm not mistaken one of the two printk()s and hence also one of the two format strings will be dead code / data on every archiecture. I wonder if we don't want to have a HAS_NUMA_FW_NODE_ID (name subject to improvment) Kconfig setting to avoid such. I could imagine this to become relevant also in other code. > +static int __init numa_scan_nodes(paddr_t start, paddr_t end) This function returns only 0 or -1, i.e. is even more of a candidate for having "bool" return type than numa_update_node_memblks(). > +{ > + unsigned int i; > + nodemask_t all_nodes_parsed; > + > + /* First clean up the node list */ > + for ( i = 0; i < MAX_NUMNODES; i++ ) > + cutoff_node(i, start, end); > + > + /* When numa is on with good firmware, we can do numa scan nodes. */ > + if ( arch_numa_disabled(true) ) > + return -1; > + > + if ( !nodes_cover_memory() ) > + { > + numa_fw_bad(); > + return -1; > + } > + > + memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, > + memblk_nodeid); > + > + if ( memnode_shift < 0 ) > + { > + printk(KERN_ERR > + "NUMA: No NUMA node hash function found. Contact > maintainer\n"); > + numa_fw_bad(); > + return -1; > + } > + > + nodes_or(all_nodes_parsed, memory_nodes_parsed, processor_nodes_parsed); > + > + /* Finally register nodes */ > + for_each_node_mask( i, all_nodes_parsed ) > + { > + paddr_t size = nodes[i].end - nodes[i].start; > + > + if ( size == 0 ) > + printk(KERN_INFO "NUMA: node %u has no memory\n", i); > + > + setup_node_bootmem(i, nodes[i].start, nodes[i].end); > + } May I suggest to eliminate "size" at this occasion, for being used only once and rather not helping readability (imo at least)? > --- a/xen/include/xen/mm.h > +++ b/xen/include/xen/mm.h > @@ -58,6 +58,8 @@ > #include <xen/perfc.h> > #include <public/memory.h> > > +extern paddr_t mem_hotplug; > + > struct page_info; > > void put_page(struct page_info *); I'm sorry, but I guess this may go about anywhere in the file, but not right at the top. I would probably have put it ahead of npfec_kind_t or right after dom_cow. Jan

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.