[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH 4/4] hvm: NUMA guest: inject NUMA topology into the guest
This patch extends the hvm_info_table to store the number of guest nodes and will create a suitable ACPI SRAT table to describe the used guest NUMA topology. Signed-off-by: Andre Przywara <andre.przywara@xxxxxxx> Regards, Andre. -- Andre Przywara AMD-Operating System Research Center (OSRC), Dresden, Germany Tel: +49 351 277-84917 ----to satisfy European Law for business letters: AMD Saxony Limited Liability Company & Co. KG, Wilschdorfer Landstr. 101, 01109 Dresden, Germany Register Court Dresden: HRA 4896, General Partner authorized to represent: AMD Saxony LLC (Wilmington, Delaware, US) General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy # HG changeset patch # User Andre Przywara <andre.przywara@xxxxxxx> # Date 1215084035 -7200 # Node ID aa69281c1ecf288c729a9fb5aaab1fa0983072bb # Parent b84c5f2fe83bd7c94ed956ba412689e614177f5c advertise NUMA topology to the guest (via an ACPI table) diff -r b84c5f2fe83b -r aa69281c1ecf tools/firmware/hvmloader/acpi/acpi2_0.h --- a/tools/firmware/hvmloader/acpi/acpi2_0.h Thu Jul 03 13:17:11 2008 +0200 +++ b/tools/firmware/hvmloader/acpi/acpi2_0.h Thu Jul 03 13:20:35 2008 +0200 @@ -356,6 +356,61 @@ }; /* + * System Resource Affinity Table header definition (SRAT). + */ +struct acpi_20_srat { + struct acpi_header header; + uint32_t table_revision; + uint32_t reserved2[2]; +}; + +#define ACPI_SRAT_TABLE_REVISION 1 + +/* + * System Resource Affinity Table structure types. + */ +#define ACPI_PROCESSOR_AFFIN 0x00 +#define ACPI_MEMORY_AFFIN 0x01 + +struct acpi_20_srat_processor { + uint8_t type; + uint8_t length; + uint8_t domain; + uint8_t apic_id; + uint32_t flags; + uint8_t sapic_id; + uint8_t domain_hi[3]; + uint32_t reserved; +}; + +/* + * Local APIC Affinity Flags. All other bits are reserved and must be 0. 
+ */ +#define ACPI_LOCAL_APIC_AFFIN_ENABLED (1 << 0) + +struct acpi_20_srat_memory { + uint8_t type; + uint8_t length; + uint8_t domain; + uint8_t domain_hi[3]; /* this is ACPI 3.0, reserved in 2.0 */ + uint16_t reserved; + uint32_t base_address_lo; + uint32_t base_address_hi; + uint32_t length_lo; + uint32_t length_hi; + uint32_t reserved2; + uint32_t flags; + uint32_t reserved3[2]; +}; + +/* + * Memory Affinity Flags. All other bits are reserved and must be 0. + */ +#define ACPI_MEM_AFFIN_ENABLED (1 << 0) +#define ACPI_MEM_AFFIN_HOTPLUGGABLE (1 << 1) +#define ACPI_MEM_AFFIN_NONVOLATILE (1 << 2) /* this is ACPI 3.0 */ + +/* * Table Signatures. */ #define ACPI_2_0_RSDP_SIGNATURE ASCII64('R','S','D',' ','P','T','R',' ') @@ -366,6 +421,7 @@ #define ACPI_2_0_XSDT_SIGNATURE ASCII32('X','S','D','T') #define ACPI_2_0_TCPA_SIGNATURE ASCII32('T','C','P','A') #define ACPI_2_0_HPET_SIGNATURE ASCII32('H','P','E','T') +#define ACPI_2_0_SRAT_SIGNATURE ASCII32('S','R','A','T') /* * Table revision numbers. 
@@ -378,6 +434,7 @@ #define ACPI_2_0_TCPA_REVISION 0x02 #define ACPI_2_0_HPET_REVISION 0x01 #define ACPI_1_0_FADT_REVISION 0x01 +#define ACPI_2_0_SRAT_REVISION 0x01 #pragma pack () diff -r b84c5f2fe83b -r aa69281c1ecf tools/firmware/hvmloader/acpi/build.c --- a/tools/firmware/hvmloader/acpi/build.c Thu Jul 03 13:17:11 2008 +0200 +++ b/tools/firmware/hvmloader/acpi/build.c Thu Jul 03 13:20:35 2008 +0200 @@ -20,6 +20,9 @@ #include "ssdt_tpm.h" #include "../config.h" #include "../util.h" +#include "../e820.h" + +#define ONEMB 0x100000 #define align16(sz) (((sz) + 15) & ~15) #define fixed_strcpy(d, s) strncpy((d), (s), sizeof(d)) @@ -45,6 +48,140 @@ p = table; p[checksum_offset] = -sum; +} + +static int vcpu_to_numa_node (int vcpu_id, int nr_vcpus) +{ +int div,mod; + + div=nr_vcpus / get_numanodes(); + mod=nr_vcpus % get_numanodes(); + + if ( vcpu_id < mod * (div + 1)) return vcpu_id / (div + 1); + return ( ( vcpu_id - (mod * (div + 1)) ) / div ) + mod; +} + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +static uint64_t guessmemsize (void) +{ + uint64_t ret = 0; + struct e820entry *map = HVM_E820; + int i; + + for ( i = 0; i < *HVM_E820_NR ; i++) + { + if (map[i].addr == ONEMB ) + ret+=map[i].size + PAGE_SIZE * 3 + ONEMB; + if (map[i].addr == (1ULL << 32)) + ret+=map[i].size; + } + return ret; +} + +int construct_srat(struct acpi_20_srat *srat) +{ + struct acpi_20_srat_processor *processor; + struct acpi_20_srat_memory *memory; + struct e820entry *map = HVM_E820; + int i, offset = 0; + uint64_t hvm_node_mem; + + memset(srat, 0, sizeof(*srat)); + srat->header.signature = ACPI_2_0_SRAT_SIGNATURE; + srat->header.revision = ACPI_2_0_SRAT_REVISION; + fixed_strcpy(srat->header.oem_id, ACPI_OEM_ID); + fixed_strcpy(srat->header.oem_table_id, ACPI_OEM_TABLE_ID); + srat->header.oem_revision = ACPI_OEM_REVISION; + srat->header.creator_id = ACPI_CREATOR_ID; + srat->header.creator_revision = ACPI_CREATOR_REVISION; + srat->table_revision = ACPI_SRAT_TABLE_REVISION; + offset 
+= sizeof(*srat); + + processor = (struct acpi_20_srat_processor *)(srat + 1); + for ( i = 0; i < get_vcpu_nr(); i++ ) + { + memset(processor, 0, sizeof(*processor)); + processor->type = ACPI_PROCESSOR_AFFIN; + processor->length = sizeof(*processor); + processor->domain = vcpu_to_numa_node (i, get_vcpu_nr()); + processor->apic_id = LAPIC_ID(i); + processor->flags = ACPI_LOCAL_APIC_AFFIN_ENABLED; + processor->sapic_id= 0; + offset += sizeof(*processor); + processor++; + } + + /* + * Equally distribute the memory on all NUMA nodes. Round up the size + * of available memory to whole megabytes, as (at least) Linux cannot cope + * with uneven NUMA node boundaries. The remaining part of memory will be + * assigned to the last NUMA node. The mapping of the first MB is copied + * from the E820 map and assigned to node 0 + */ + hvm_node_mem = guessmemsize()+ONEMB-1; + hvm_node_mem = hvm_node_mem >> 20; + /* 64bit/32bit does not work because of missing libgcc */ + hvm_node_mem = (uint32_t)hvm_node_mem / get_numanodes(); + hvm_node_mem = hvm_node_mem << 20; + + memory = (struct acpi_20_srat_memory *)(processor); + for ( i = 0; i < *HVM_E820_NR; i++ ) + { + if ( map[i].type != E820_RAM ) continue; + if ( map[i].addr >= ONEMB ) break; + + memset(memory, 0, sizeof(*memory)); + memory->type = ACPI_MEMORY_AFFIN; + memory->length = sizeof(*memory); + memory->domain = 0; + memory->base_address_lo = map[i].addr & 0xFFFFFFFFL; + memory->base_address_hi = map[i].addr >> 32; + memory->length_lo = map[i].size & 0xFFFFFFFFL; + memory->length_hi = map[i].size >> 32; + memory->flags = ACPI_MEM_AFFIN_ENABLED; + + offset += sizeof(*memory); + memory++; + } + + for ( i = 0; i < get_numanodes(); i++ ) + { + memset(memory, 0, sizeof(*memory)); + memory->type = ACPI_MEMORY_AFFIN; + memory->length = sizeof(*memory); + memory->domain = i; + if ( i == 0 ) + { + memory->base_address_lo = ONEMB; + memory->base_address_hi = 0; + memory->length_lo = ( hvm_node_mem - ONEMB ) & 0xFFFFFFFFL; + 
memory->length_hi = ( hvm_node_mem - ONEMB ) >> 32; + } else + if ( i == get_numanodes()-1 ) + { + memory->base_address_lo = (i * hvm_node_mem) & 0xFFFFFFFFL; + memory->base_address_hi = (i * hvm_node_mem) >> 32; + memory->length_lo = (guessmemsize()-hvm_node_mem*i) & 0xFFFFFFFFL; + memory->length_hi = (guessmemsize()-hvm_node_mem*i) >> 32; + } else + { + memory->base_address_lo = (i * hvm_node_mem) & 0xFFFFFFFFL; + memory->base_address_hi = (i * hvm_node_mem) >> 32; + memory->length_lo = hvm_node_mem & 0xFFFFFFFFL; + memory->length_hi = hvm_node_mem >> 32; + } + memory->flags = ACPI_MEM_AFFIN_ENABLED; + offset += sizeof(*memory); + memory++; + } + + srat->header.length = offset; + set_checksum(srat, offsetof(struct acpi_header, checksum), offset); + + return align16(offset); } static int uart_exists(uint16_t uart_base) @@ -192,6 +329,7 @@ static int construct_secondary_tables(uint8_t *buf, unsigned long *table_ptrs) { int offset = 0, nr_tables = 0; + struct acpi_20_srat *srat; struct acpi_20_madt *madt; struct acpi_20_hpet *hpet; struct acpi_20_tcpa *tcpa; @@ -204,6 +342,14 @@ madt = (struct acpi_20_madt *)&buf[offset]; offset += construct_madt(madt); table_ptrs[nr_tables++] = (unsigned long)madt; + } + + /* SRAT. */ + if ( get_numanodes() > 0 ) + { + srat = (struct acpi_20_srat *)&buf[offset]; + offset += construct_srat(srat); + table_ptrs[nr_tables++] = (unsigned long)srat; } /* HPET. */ diff -r b84c5f2fe83b -r aa69281c1ecf tools/firmware/hvmloader/util.c --- a/tools/firmware/hvmloader/util.c Thu Jul 03 13:17:11 2008 +0200 +++ b/tools/firmware/hvmloader/util.c Thu Jul 03 13:20:35 2008 +0200 @@ -594,6 +594,12 @@ return (t ? t->nr_vcpus : 1); } +int get_numanodes(void) +{ + struct hvm_info_table *t = get_hvm_info_table(); + return (t ? 
t->numanodes : 1); +} + int get_acpi_enabled(void) { struct hvm_info_table *t = get_hvm_info_table(); diff -r b84c5f2fe83b -r aa69281c1ecf tools/firmware/hvmloader/util.h --- a/tools/firmware/hvmloader/util.h Thu Jul 03 13:17:11 2008 +0200 +++ b/tools/firmware/hvmloader/util.h Thu Jul 03 13:20:35 2008 +0200 @@ -104,6 +104,7 @@ /* HVM-builder info. */ int get_vcpu_nr(void); +int get_numanodes(void); int get_acpi_enabled(void); int get_apic_mode(void); diff -r b84c5f2fe83b -r aa69281c1ecf tools/python/xen/lowlevel/xc/xc.c --- a/tools/python/xen/lowlevel/xc/xc.c Thu Jul 03 13:17:11 2008 +0200 +++ b/tools/python/xen/lowlevel/xc/xc.c Thu Jul 03 13:20:35 2008 +0200 @@ -845,6 +845,18 @@ #endif /* __i386__ || __x86_64__ */ +static unsigned hweight_long (unsigned long value) +{ +int ret=0; + + while (value>0) + { + if (value&1) ++ret; + value>>=1; + } + return ret; +} + static PyObject *pyxc_hvm_build(XcObject *self, PyObject *args, PyObject *kwds) @@ -884,6 +896,7 @@ va_hvm->acpi_enabled = acpi; va_hvm->apic_mode = apic; va_hvm->nr_vcpus = vcpus; + va_hvm->numanodes = hweight_long(nodemask); for ( i = 0, sum = 0; i < va_hvm->length; i++ ) sum += ((uint8_t *)va_hvm)[i]; va_hvm->checksum = -sum; diff -r b84c5f2fe83b -r aa69281c1ecf xen/include/public/hvm/hvm_info_table.h --- a/xen/include/public/hvm/hvm_info_table.h Thu Jul 03 13:17:11 2008 +0200 +++ b/xen/include/public/hvm/hvm_info_table.h Thu Jul 03 13:20:35 2008 +0200 @@ -36,6 +36,7 @@ uint8_t acpi_enabled; uint8_t apic_mode; uint32_t nr_vcpus; + uint32_t numanodes; }; #endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */ _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
Lists.xenproject.org is hosted with RackSpace, monitoring our technical stack to ensure its availability.