x86: support up to 16TB This mainly involves adjusting the number of L4 entries needing copying between page tables (which is now different between PV and HVM/idle domains), and changing the cutoff point and method when more than the supported amount of memory is found in a system. Since TMEM doesn't currently cope with the full 1:1 map not always being visible, it gets forcefully disabled in that case. Signed-off-by: Jan Beulich --- a/xen/arch/x86/efi/boot.c +++ b/xen/arch/x86/efi/boot.c @@ -1591,7 +1591,7 @@ void __init efi_init_memory(void) /* Insert Xen mappings. */ for ( i = l4_table_offset(HYPERVISOR_VIRT_START); - i < l4_table_offset(HYPERVISOR_VIRT_END); ++i ) + i < l4_table_offset(DIRECTMAP_VIRT_END); ++i ) efi_l4_pgtable[i] = idle_pg_table[i]; #endif } --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -1320,7 +1320,7 @@ void init_guest_l4_table(l4_pgentry_t l4 /* Xen private mappings. */ memcpy(&l4tab[ROOT_PAGETABLE_FIRST_XEN_SLOT], &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT], - ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t)); + ROOT_PAGETABLE_PV_XEN_SLOTS * sizeof(l4_pgentry_t)); l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] = l4e_from_pfn(domain_page_map_to_mfn(l4tab), __PAGE_HYPERVISOR); l4tab[l4_table_offset(PERDOMAIN_VIRT_START)] = --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -25,6 +25,7 @@ #include #include #include +#include /* for opt_tmem only */ #include #include #include @@ -381,6 +382,11 @@ static void __init setup_max_pdx(void) if ( max_pdx > FRAMETABLE_NR ) max_pdx = FRAMETABLE_NR; +#ifdef PAGE_LIST_NULL + if ( max_pdx >= PAGE_LIST_NULL ) + max_pdx = PAGE_LIST_NULL - 1; +#endif + max_page = pdx_to_pfn(max_pdx - 1) + 1; } @@ -1031,9 +1037,23 @@ void __init __start_xen(unsigned long mb /* Create new mappings /before/ passing memory to the allocator. 
*/ if ( map_e < e ) { - map_pages_to_xen((unsigned long)__va(map_e), map_e >> PAGE_SHIFT, - (e - map_e) >> PAGE_SHIFT, PAGE_HYPERVISOR); - init_boot_pages(map_e, e); + uint64_t limit = __pa(HYPERVISOR_VIRT_END - 1) + 1; + uint64_t end = min(e, limit); + + if ( map_e < end ) + { + map_pages_to_xen((unsigned long)__va(map_e), PFN_DOWN(map_e), + PFN_DOWN(end - map_e), PAGE_HYPERVISOR); + init_boot_pages(map_e, end); + map_e = end; + } + } + if ( map_e < e ) + { + /* This range must not be passed to the boot allocator and + * must also not be mapped with _PAGE_GLOBAL. */ + map_pages_to_xen((unsigned long)__va(map_e), PFN_DOWN(map_e), + PFN_DOWN(e - map_e), __PAGE_HYPERVISOR); } if ( s < map_s ) { @@ -1104,6 +1124,34 @@ void __init __start_xen(unsigned long mb end_boot_allocator(); system_state = SYS_STATE_boot; + if ( max_page - 1 > virt_to_mfn(HYPERVISOR_VIRT_END - 1) ) + { + unsigned long limit = virt_to_mfn(HYPERVISOR_VIRT_END - 1); + uint64_t mask = PAGE_SIZE - 1; + + xenheap_max_mfn(limit); + + /* Pass the remaining memory to the allocator. 
*/ + for ( i = 0; i < boot_e820.nr_map; i++ ) + { + uint64_t s, e; + + s = (boot_e820.map[i].addr + mask) & ~mask; + e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask; + if ( PFN_DOWN(e) <= limit ) + continue; + if ( PFN_DOWN(s) <= limit ) + s = pfn_to_paddr(limit + 1); + init_domheap_pages(s, e); + } + + if ( opt_tmem ) + { + printk(XENLOG_WARNING "Forcing TMEM off\n"); + opt_tmem = 0; + } + } + vm_init(); vesa_init(); --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -1471,10 +1471,23 @@ int memory_add(unsigned long spfn, unsig return -EINVAL; } - ret = map_pages_to_xen((unsigned long)mfn_to_virt(spfn), spfn, - epfn - spfn, PAGE_HYPERVISOR); - if ( ret ) - return ret; + i = virt_to_mfn(HYPERVISOR_VIRT_END - 1) + 1; + if ( spfn < i ) + { + ret = map_pages_to_xen((unsigned long)mfn_to_virt(spfn), spfn, + min(epfn, i) - spfn, PAGE_HYPERVISOR); + if ( ret ) + return ret; + } + if ( i < epfn ) + { + if ( i < spfn ) + i = spfn; + ret = map_pages_to_xen((unsigned long)mfn_to_virt(i), i, + epfn - i, __PAGE_HYPERVISOR); + if ( ret ) + return ret; + } old_node_start = NODE_DATA(node)->node_start_pfn; old_node_span = NODE_DATA(node)->node_spanned_pages; --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -255,6 +255,9 @@ static unsigned long init_node_heap(int unsigned long needed = (sizeof(**_heap) + sizeof(**avail) * NR_ZONES + PAGE_SIZE - 1) >> PAGE_SHIFT; +#ifdef DIRECTMAP_VIRT_END + unsigned long eva = min(DIRECTMAP_VIRT_END, HYPERVISOR_VIRT_END); +#endif int i, j; if ( !first_node_initialised ) @@ -266,14 +269,14 @@ static unsigned long init_node_heap(int } #ifdef DIRECTMAP_VIRT_END else if ( *use_tail && nr >= needed && - (mfn + nr) <= (virt_to_mfn(DIRECTMAP_VIRT_END - 1) + 1) ) + (mfn + nr) <= (virt_to_mfn(eva - 1) + 1) ) { _heap[node] = mfn_to_virt(mfn + nr - needed); avail[node] = mfn_to_virt(mfn + nr - 1) + PAGE_SIZE - sizeof(**avail) * NR_ZONES; } else if ( nr >= needed && - (mfn + needed) <= (virt_to_mfn(DIRECTMAP_VIRT_END - 
1) + 1) ) + (mfn + needed) <= (virt_to_mfn(eva - 1) + 1) ) { _heap[node] = mfn_to_virt(mfn); avail[node] = mfn_to_virt(mfn + needed - 1) + @@ -1205,6 +1208,13 @@ void free_xenheap_pages(void *v, unsigne #else +static unsigned int __read_mostly xenheap_bits; + +void __init xenheap_max_mfn(unsigned long mfn) +{ + xenheap_bits = fls(mfn) + PAGE_SHIFT - 1; +} + void init_xenheap_pages(paddr_t ps, paddr_t pe) { init_domheap_pages(ps, pe); @@ -1217,6 +1227,11 @@ void *alloc_xenheap_pages(unsigned int o ASSERT(!in_irq()); + if ( xenheap_bits && (memflags >> _MEMF_bits) > xenheap_bits ) + memflags &= ~MEMF_bits(~0); + if ( !(memflags >> _MEMF_bits) ) + memflags |= MEMF_bits(xenheap_bits); + pg = alloc_domheap_pages(NULL, order, memflags); if ( unlikely(pg == NULL) ) return NULL; --- a/xen/include/asm-x86/config.h +++ b/xen/include/asm-x86/config.h @@ -163,8 +163,12 @@ extern unsigned char boot_edid_info[128] * Page-frame information array. * 0xffff830000000000 - 0xffff87ffffffffff [5TB, 5*2^40 bytes, PML4:262-271] * 1:1 direct mapping of all physical memory. - * 0xffff880000000000 - 0xffffffffffffffff [120TB, PML4:272-511] - * Guest-defined use. + * 0xffff880000000000 - 0xffffffffffffffff [120TB, PML4:272-511] + * PV: Guest-defined use. + * 0xffff880000000000 - 0xffffff7fffffffff [119.5TB, PML4:272-510] + * HVM/idle: continuation of 1:1 mapping + * 0xffffff8000000000 - 0xffffffffffffffff [512GB, 2^39 bytes PML4:511] + * HVM/idle: unused * * Compatibility guest area layout: * 0x0000000000000000 - 0x00000000f57fffff [3928MB, PML4:0] @@ -183,6 +187,8 @@ extern unsigned char boot_edid_info[128] #define ROOT_PAGETABLE_FIRST_XEN_SLOT 256 #define ROOT_PAGETABLE_LAST_XEN_SLOT 271 #define ROOT_PAGETABLE_XEN_SLOTS \ + (L4_PAGETABLE_ENTRIES - ROOT_PAGETABLE_FIRST_XEN_SLOT - 1) +#define ROOT_PAGETABLE_PV_XEN_SLOTS \ (ROOT_PAGETABLE_LAST_XEN_SLOT - ROOT_PAGETABLE_FIRST_XEN_SLOT + 1) /* Hypervisor reserves PML4 slots 256 to 271 inclusive. 
*/ @@ -241,9 +247,9 @@ extern unsigned char boot_edid_info[128] #define FRAMETABLE_SIZE GB(128) #define FRAMETABLE_NR (FRAMETABLE_SIZE / sizeof(*frame_table)) #define FRAMETABLE_VIRT_START (FRAMETABLE_VIRT_END - FRAMETABLE_SIZE) -/* Slot 262-271: A direct 1:1 mapping of all of physical memory. */ +/* Slot 262-271/510: A direct 1:1 mapping of all of physical memory. */ #define DIRECTMAP_VIRT_START (PML4_ADDR(262)) -#define DIRECTMAP_SIZE (PML4_ENTRY_BYTES*10) +#define DIRECTMAP_SIZE (PML4_ENTRY_BYTES * (511 - 262)) #define DIRECTMAP_VIRT_END (DIRECTMAP_VIRT_START + DIRECTMAP_SIZE) #ifndef __ASSEMBLY__ --- a/xen/include/xen/mm.h +++ b/xen/include/xen/mm.h @@ -43,6 +43,7 @@ void end_boot_allocator(void); /* Xen suballocator. These functions are interrupt-safe. */ void init_xenheap_pages(paddr_t ps, paddr_t pe); +void xenheap_max_mfn(unsigned long mfn); void *alloc_xenheap_pages(unsigned int order, unsigned int memflags); void free_xenheap_pages(void *v, unsigned int order); #define alloc_xenheap_page() (alloc_xenheap_pages(0,0)) @@ -111,7 +112,7 @@ struct page_list_head /* These must only have instances in struct page_info. */ # define page_list_entry -#define PAGE_LIST_NULL (~0) +# define PAGE_LIST_NULL ((typeof(((struct page_info){}).list.next))~0) # if !defined(pdx_to_page) && !defined(page_to_pdx) # if defined(__page_to_mfn) || defined(__mfn_to_page)