[Xen-devel] [patch] PAE for xen
Hi,

This patch adds initial support for PAE paging to xen. It's against
cset 1.1442 (as of this morning).

This patch does:

 * boot Xen itself with PAE paging enabled.
 * add PAE support to the dom0 domain builder.

Not yet done (some details will come in another mail):

 * fix the hypercall interfaces to handle 64bit page table entries.
 * actually use memory above 4GB (depends on the hypercall interface
   changes).
 * boot something other than domain 0.
 * shadow mode support.

I'll (hopefully) submit xenlinux patches later this week or next week.
The current state can be found at http://dl.bytesex.org/patches/

Some notes on the design:

 * There are two new config options: CONFIG_X86_PAE (boolean, same name
   Linux uses, to simplify things) and CONFIG_PAGING_LEVELS (int,
   possible values are 2, 3 and 4). I've used #if CONFIG_PAGING_LEVELS
   for stuff which simply depends on the number of paging levels in
   code shared between x86-32 and x86-64, and CONFIG_X86_PAE for
   special PAE quirks and i386-only stuff. I've tried to avoid ifdefs
   where possible though; often I rearranged code to make it work in
   both the PAE and non-PAE case instead.

 * idle_pg_table: the 3rd level is statically initialized, and the 2nd
   level is contiguous in physical and virtual memory, so it can be
   addressed linearly (the dom0 builder uses the same trick to simplify
   things a bit, btw). There are two new symbols, idle_pg_table_l3 and
   idle_pg_table_l2, for the two tables. idle_pg_table is aliased to
   the toplevel page table, i.e. idle_pg_table_l3 in PAE mode and
   idle_pg_table_l2 in non-PAE mode. The idle l3 table is never touched
   after boot; the l2 table is accessed via idle_pg_table_l2 and
   addressed linearly in both PAE and non-PAE mode.
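To make the linear-addressing trick concrete, here is a minimal sketch
(illustration only, not part of the patch; idle_l2_entry() is a made-up
name, the other symbols are the ones introduced below):

    /* In PAE mode the four l2 pages sit back-to-back behind
     * idle_pg_table_l2, so one flat index reaches all 4*512 entries
     * and no l3 lookup is needed.  The same expression also covers
     * the single 4kB l2 table (1024 entries) in non-PAE mode. */
    static inline l2_pgentry_t *idle_l2_entry(unsigned long vaddr)
    {
        return &idle_pg_table_l2[l2_linear_offset(vaddr)];
    }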
Please apply. Comments & questions are welcome.

  Gerd

PS: Some bits in this patch are from Scott Parish <srparish@xxxxxxxxxx>

Signed-off-by: Gerd Knorr <kraxel@xxxxxxxxxxx>

Index: xen/include/asm-x86/config.h
===================================================================
--- xen.orig/include/asm-x86/config.h	2005-05-13 12:37:10.000000000 +0200
+++ xen/include/asm-x86/config.h	2005-05-13 12:58:42.000000000 +0200
@@ -9,6 +9,19 @@
 
 #define CONFIG_VMX 1
 
+#if defined(__i386__)
+// # define CONFIG_X86_PAE 1    /* yes */
+# undef CONFIG_X86_PAE          /* no  */
+#endif
+
+#if defined(__x86_64)
+# define CONFIG_PAGING_LEVELS 4
+#elif defined(CONFIG_X86_PAE)
+# define CONFIG_PAGING_LEVELS 3
+#else
+# define CONFIG_PAGING_LEVELS 2
+#endif
+
 #define CONFIG_X86 1
 
 #define CONFIG_SHADOW 1
@@ -194,7 +207,7 @@ extern unsigned long _end; /* standard E
  * Per-domain mappings                                 (  4MB)
  * Shadow linear pagetable                             (  4MB) (  8MB)
  * Guest linear pagetable                              (  4MB) (  8MB)
- * Machine-to-physical translation table [writable]   (  4MB)
+ * Machine-to-physical translation table [writable]   (  4MB) ( 16MB)
  * Frame-info table                                    ( 24MB) ( 96MB)
  *
  *   Start of guest inaccessible area
  * Machine-to-physical translation table [read-only]  (  4MB)
@@ -208,8 +221,8 @@ extern unsigned long _end; /* standard E
 
 #ifdef CONFIG_X86_PAE
 # define LINEARPT_MBYTES         8
-# define MACHPHYS_MBYTES         4 /* KAF: This needs to be bigger */
-# define FRAMETABLE_MBYTES      96 /* 16 GB mem limit (total)     */
+# define MACHPHYS_MBYTES        16 /* 1 MB needed per 1 GB memory */
+# define FRAMETABLE_MBYTES (MACHPHYS_MBYTES * 6)
 #else
 # define LINEARPT_MBYTES         4
 # define MACHPHYS_MBYTES         4
@@ -242,21 +255,21 @@ extern unsigned long _end; /* standard E
 #define GUEST_SEGMENT_MAX_ADDR  RO_MPT_VIRT_END
 
 #ifdef CONFIG_X86_PAE
-/* Hypervisor owns top 144MB of virtual address space. */
-# define __HYPERVISOR_VIRT_START  0xF7000000
-# define HYPERVISOR_VIRT_START   (0xF7000000UL)
+/* Hypervisor owns top 168MB of virtual address space. */
+# define __HYPERVISOR_VIRT_START  0xF5800000
+# define HYPERVISOR_VIRT_START   (0xF5800000UL)
 #else
 /* Hypervisor owns top 64MB of virtual address space. */
 # define __HYPERVISOR_VIRT_START  0xFC000000
 # define HYPERVISOR_VIRT_START   (0xFC000000UL)
 #endif
 
-#define ROOT_PAGETABLE_FIRST_XEN_SLOT \
+#define L2_PAGETABLE_FIRST_XEN_SLOT \
     (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT)
-#define ROOT_PAGETABLE_LAST_XEN_SLOT \
+#define L2_PAGETABLE_LAST_XEN_SLOT \
     (~0UL >> L2_PAGETABLE_SHIFT)
-#define ROOT_PAGETABLE_XEN_SLOTS \
-    (ROOT_PAGETABLE_LAST_XEN_SLOT - ROOT_PAGETABLE_FIRST_XEN_SLOT + 1)
+#define L2_PAGETABLE_XEN_SLOTS \
+    (L2_PAGETABLE_LAST_XEN_SLOT - L2_PAGETABLE_FIRST_XEN_SLOT + 1)
 
 #define PGT_base_page_table PGT_l2_page_table
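For reference, the sizing arithmetic behind the new PAE constants above
(my summary; the patch itself only states the 1MB-per-GB rule): the M2P
table needs one 4-byte entry per 4kB page, i.e. 256k entries x 4 bytes
= 1MB per GB of memory, so MACHPHYS_MBYTES 16 covers a 16GB machine.
The frame table holds one struct pfn_info per page, and 96MB for 16GB
works out to 24 bytes per entry — six times the 4-byte M2P entry —
hence FRAMETABLE_MBYTES (MACHPHYS_MBYTES * 6). These are the ( 16MB)
and ( 96MB) columns in the memory-map comment above.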
Index: xen/arch/x86/setup.c
===================================================================
--- xen.orig/arch/x86/setup.c	2005-05-13 12:37:10.000000000 +0200
+++ xen/arch/x86/setup.c	2005-05-13 12:37:42.000000000 +0200
@@ -70,7 +70,7 @@ extern int do_timer_lists_from_pit;
 
 struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1 };
 
-#if defined(__x86_64__)
+#if CONFIG_PAGING_LEVELS > 2
 unsigned long mmu_cr4_features = X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE;
 #else
 unsigned long mmu_cr4_features = X86_CR4_PSE | X86_CR4_PGE;

Index: xen/arch/x86/domain_build.c
===================================================================
--- xen.orig/arch/x86/domain_build.c	2005-05-13 12:37:10.000000000 +0200
+++ xen/arch/x86/domain_build.c	2005-05-13 12:37:42.000000000 +0200
@@ -44,15 +44,15 @@ boolean_param("dom0_translate", opt_dom0
 #if defined(__i386__)
 /* No ring-3 access in initial leaf page tables. */
 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
+#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
+#define L3_PROT (_PAGE_PRESENT)
 #elif defined(__x86_64__)
 /* Allow ring-3 access in long mode as guest cannot use ring 1. */
 #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
-#endif
-/* Don't change these: Linux expects just these bits to be set. */
-/* (And that includes the bogus _PAGE_DIRTY!) */
 #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
 #define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
 #define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
+#endif
 
 #define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
 #define round_pgdown(_p)  ((_p)&PAGE_MASK)
@@ -91,7 +91,11 @@ int construct_dom0(struct domain *d,
 #elif defined(__x86_64__)
     char *image_start  = __va(_image_start);
     char *initrd_start = __va(_initrd_start);
+#endif
+#if CONFIG_PAGING_LEVELS >= 4
     l4_pgentry_t *l4tab = NULL, *l4start = NULL;
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
     l3_pgentry_t *l3tab = NULL, *l3start = NULL;
 #endif
     l2_pgentry_t *l2tab = NULL, *l2start = NULL;
@@ -172,10 +176,15 @@ int construct_dom0(struct domain *d,
         v_end            = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
         if ( (v_end - vstack_end) < (512UL << 10) )
             v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
-#if defined(__i386__)
+#if defined(__i386__) && !defined(CONFIG_X86_PAE)
         if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
                L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
             break;
+#elif defined(__i386__) && defined(CONFIG_X86_PAE)
+        /* 5 pages: 1x 3rd + 4x 2nd level */
+        if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
+               L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
+            break;
 #elif defined(__x86_64__)
 #define NR(_l,_h,_s) \
     (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
@@ -252,6 +261,24 @@ int construct_dom0(struct domain *d,
     }
 
     /* WARNING: The new domain must have its 'processor' field filled in! */
+#if CONFIG_PAGING_LEVELS == 3
+    l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
+    l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
+    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
+    for (i = 0; i < 4; i++) {
+        l3tab[i] = l3e_create_phys((u32)l2tab + i*PAGE_SIZE, L3_PROT);
+        l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
+            l2e_create_phys((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
+    }
+    unsigned long v;
+    for (v = PERDOMAIN_VIRT_START; v < PERDOMAIN_VIRT_END;
+         v += (1 << L2_PAGETABLE_SHIFT)) {
+        l2tab[v >> L2_PAGETABLE_SHIFT] =
+            l2e_create_phys(__pa(d->arch.mm_perdomain_pt) + (v-PERDOMAIN_VIRT_START),
+                            __PAGE_HYPERVISOR);
+    }
+    ed->arch.guest_table = mk_pagetable((unsigned long)l3start);
+#else
     l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
     memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
     l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
@@ -259,8 +286,9 @@ int construct_dom0(struct domain *d,
     l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
         l2e_create_phys(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
     ed->arch.guest_table = mk_pagetable((unsigned long)l2start);
+#endif
 
-    l2tab += l2_table_offset(dsi.v_start);
+    l2tab += l2_linear_offset(dsi.v_start);
     mfn = alloc_start >> PAGE_SHIFT;
     for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
     {
@@ -285,8 +313,8 @@ int construct_dom0(struct domain *d,
     }
 
     /* Pages that are part of page tables must be read only. */
-    l2tab = l2start + l2_table_offset(vpt_start);
-    l1start = l1tab = (l1_pgentry_t *)l2e_get_phys(*l2tab);
+    l2tab = l2start + l2_linear_offset(vpt_start);
+    l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_phys(*l2tab);
     l1tab += l1_table_offset(vpt_start);
     for ( count = 0; count < nr_pt_pages; count++ )
     {
@@ -297,6 +325,34 @@ int construct_dom0(struct domain *d,
         if ( !get_page_type(page, PGT_writable_page) )
             BUG();
 
+#if CONFIG_PAGING_LEVELS == 3
+        switch (count) {
+        case 0:
+            page->u.inuse.type_info &= ~PGT_type_mask;
+            page->u.inuse.type_info |= PGT_l3_page_table;
+            get_page(page, d); /* an extra ref because of readable mapping */
+
+            /* Get another ref to L3 page so that it can be pinned. */
+            if ( !get_page_and_type(page, d, PGT_l3_page_table) )
+                BUG();
+            set_bit(_PGT_pinned, &page->u.inuse.type_info);
+            break;
+        case 1 ... 4:
+            page->u.inuse.type_info &= ~PGT_type_mask;
+            page->u.inuse.type_info |= PGT_l2_page_table;
+            page->u.inuse.type_info |=
+                (count-1) << PGT_va_shift;
+            get_page(page, d); /* an extra ref because of readable mapping */
+            break;
+        default:
+            page->u.inuse.type_info &= ~PGT_type_mask;
+            page->u.inuse.type_info |= PGT_l1_page_table;
+            page->u.inuse.type_info |=
+                ((dsi.v_start>>L2_PAGETABLE_SHIFT)+(count-5))<<PGT_va_shift;
+            get_page(page, d); /* an extra ref because of readable mapping */
+            break;
+        }
+#else
         if ( count == 0 )
         {
             page->u.inuse.type_info &= ~PGT_type_mask;
@@ -329,8 +385,9 @@ int construct_dom0(struct domain *d,
              */
             get_page(page, d); /* an extra ref because of readable mapping */
         }
+#endif
         if ( !((unsigned long)++l1tab & (PAGE_SIZE - 1)) )
-            l1start = l1tab = (l1_pgentry_t *)l2e_get_phys(*++l2tab);
+            l1start = l1tab = (l1_pgentry_t *)(u32)l2e_get_phys(*++l2tab);
     }
 
 #elif defined(__x86_64__)
@@ -541,10 +598,8 @@ int construct_dom0(struct domain *d,
 
 #if defined(__i386__)
     /* Destroy low mappings - they were only for our convenience. */
-    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
-        if ( l2e_get_flags(l2start[i]) & _PAGE_PSE )
-            l2start[i] = l2e_empty();
-    zap_low_mappings(); /* Do the same for the idle page tables. */
+    zap_low_mappings(l2start);
+    zap_low_mappings(idle_pg_table_l2);
 #endif
 
     /* DOM0 gets access to everything. */
@@ -561,6 +616,12 @@ int construct_dom0(struct domain *d,
                        : SHM_enable));
     if ( opt_dom0_translate )
     {
+#if defined(__i386__) && defined(CONFIG_X86_PAE)
+        printk("FIXME: PAE code needed here: %s:%d (%s)\n",
+               __FILE__, __LINE__, __FUNCTION__);
+        for ( ; ; )
+            __asm__ __volatile__ ( "hlt" );
+#else
         /* Hmm, what does this?
            Looks like isn't portable across 32/64 bit and pae/non-pae ...
            -- kraxel */
@@ -583,6 +644,7 @@ int construct_dom0(struct domain *d,
             pagetable_get_pfn(ed->arch.guest_table));
         idle_pg_table[1] = root_empty();
         local_flush_tlb();
+#endif
     }
 
     update_pagetables(ed); /* XXX SMP */
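A note on the "+ 5" pagetable reservation above (my arithmetic, spelled
out): in PAE mode the dom0 builder always places one l3 page plus four
l2 pages in front of the l1 pages, so nr_pt_pages must cover the l1
pages plus 5, where the non-PAE path needs the l1 pages plus a single
l2 page. The same layout is why the read-only pagetable loop types
count 0 as PGT_l3_page_table, counts 1-4 as PGT_l2_page_table, and
everything after that as PGT_l1_page_table.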
Index: xen/include/asm-x86/page.h
===================================================================
--- xen.orig/include/asm-x86/page.h	2005-05-13 12:37:10.000000000 +0200
+++ xen/include/asm-x86/page.h	2005-05-13 12:47:49.000000000 +0200
@@ -9,10 +9,14 @@
 #endif
 #define PAGE_MASK           (~(PAGE_SIZE-1))
 
+#ifndef __ASSEMBLY__
+# include <asm/types.h>
+#endif
+
 #if defined(__i386__)
-#include <asm/x86_32/page.h>
+# include <asm/x86_32/page.h>
 #elif defined(__x86_64__)
-#include <asm/x86_64/page.h>
+# include <asm/x86_64/page.h>
 #endif
 
 /* Convert a pointer to a page-table entry into pagetable slot index. */
@@ -21,9 +25,18 @@
 
 /* Page-table type. */
 #ifndef __ASSEMBLY__
-typedef struct { unsigned long pt_lo; } pagetable_t;
-#define pagetable_val(_x)     ((_x).pt_lo)
-#define pagetable_get_pfn(_x) ((_x).pt_lo >> PAGE_SHIFT)
+#if CONFIG_PAGING_LEVELS == 2
+/* x86_32 default */
+typedef struct { u32 pt; } pagetable_t;
+#elif CONFIG_PAGING_LEVELS == 3
+/* x86_32 PAE */
+typedef struct { u32 pt; } pagetable_t; /* FIXME */
+#elif CONFIG_PAGING_LEVELS == 4
+/* x86_64 */
+typedef struct { u64 pt; } pagetable_t;
+#endif
+#define pagetable_val(_x)     ((_x).pt)
+#define pagetable_get_pfn(_x) ((_x).pt >> PAGE_SHIFT)
 #define mk_pagetable(_x)      ( (pagetable_t) { (_x) } )
 #endif
@@ -39,6 +52,7 @@ typedef struct { unsigned long pt_lo; }
 #define pfn_valid(_pfn)     ((_pfn) < max_page)
 
 /* High table entries are reserved by the hypervisor. */
+/* FIXME: this breaks with PAE -- kraxel */
 #define DOMAIN_ENTRIES_PER_L2_PAGETABLE     \
   (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT)
 #define HYPERVISOR_ENTRIES_PER_L2_PAGETABLE \
@@ -73,7 +87,14 @@ typedef struct { unsigned long pt_lo; }
 #define va_to_l1mfn(_ed, _va) \
     (l2e_get_pfn(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT]))
 
+#if CONFIG_PAGING_LEVELS == 3
 extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES];
+extern l3_pgentry_t   idle_pg_table_l3[ROOT_PAGETABLE_ENTRIES];
+extern l2_pgentry_t   idle_pg_table_l2[ROOT_PAGETABLE_ENTRIES*L2_PAGETABLE_ENTRIES];
+#else
+extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES];
+extern l2_pgentry_t   idle_pg_table_l2[ROOT_PAGETABLE_ENTRIES];
+#endif
 
 extern void paging_init(void);
@@ -131,6 +152,8 @@ static __inline__ int get_order(unsigned
     return order;
 }
 
+extern void printk_page_flags(u32 flags);
+
 /* Map physical byte range (@p, @p+@s) at virt address @v in pagetable @pt. */
 extern int map_pages(

Index: xen/include/asm-x86/x86_32/page-2l.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ xen/include/asm-x86/x86_32/page-2l.h	2005-05-13 12:37:42.000000000 +0200
@@ -0,0 +1,109 @@
+#ifndef __X86_32_PAGE_2L_H__
+#define __X86_32_PAGE_2L_H__
+
+#define L1_PAGETABLE_SHIFT      12
+#define L2_PAGETABLE_SHIFT      22
+#define PAGE_SHIFT              L1_PAGETABLE_SHIFT
+#define ROOT_PAGETABLE_SHIFT    L2_PAGETABLE_SHIFT
+
+#define PAGETABLE_ORDER         10
+#define L1_PAGETABLE_ENTRIES    (1<<PAGETABLE_ORDER)
+#define L2_PAGETABLE_ENTRIES    (1<<PAGETABLE_ORDER)
+#define ROOT_PAGETABLE_ENTRIES  L2_PAGETABLE_ENTRIES
+
+#define PADDR_BITS              32
+#define PADDR_MASK              (~0UL)
+
+#ifndef __ASSEMBLY__
+#include <asm/types.h>
+typedef struct { u32 l1_lo; } l1_pgentry_t;
+typedef struct { u32 l2_lo; } l2_pgentry_t;
+typedef l2_pgentry_t root_pgentry_t;
+
+/* read access (deprecated) */
+#define l1e_get_value(_x)   ((u32)(_x).l1_lo)
+#define l2e_get_value(_x)   ((u32)(_x).l2_lo)
+
+/* read access */
+#define l1e_get_pfn(_x)     ((u32)((_x).l1_lo >> PAGE_SHIFT))
+#define l1e_get_phys(_x)    ((u32)((_x).l1_lo &  PAGE_MASK))
+#define l1e_get_flags(_x)   ((u32)((_x).l1_lo & ~PAGE_MASK))
+
+#define l2e_get_pfn(_x)     ((u32)((_x).l2_lo >> PAGE_SHIFT))
+#define l2e_get_phys(_x)    ((u32)((_x).l2_lo &  PAGE_MASK))
+#define l2e_get_flags(_x)   ((u32)((_x).l2_lo & ~PAGE_MASK))
+
+/* write access */
+static inline l1_pgentry_t l1e_empty(void)
+{
+    l1_pgentry_t e = { .l1_lo = 0 };
+    return e;
+}
+static inline l1_pgentry_t l1e_create_pfn(u32 pfn, u32 flags)
+{
+    l1_pgentry_t e = { .l1_lo = (pfn << PAGE_SHIFT) | flags };
+    return e;
+}
+static inline l1_pgentry_t l1e_create_phys(u32 addr, u32 flags)
+{
+    l1_pgentry_t e = { .l1_lo = (addr & PAGE_MASK) | flags };
+    return e;
+}
+static inline void l1e_add_flags(l1_pgentry_t *e, u32 flags)
+{
+    e->l1_lo |= flags;
+}
+static inline void l1e_remove_flags(l1_pgentry_t *e, u32 flags)
+{
+    e->l1_lo &= ~flags;
+}
+
+static inline l2_pgentry_t l2e_empty(void)
+{
+    l2_pgentry_t e = { .l2_lo = 0 };
+    return e;
+}
+static inline l2_pgentry_t l2e_create_pfn(u32 pfn, u32 flags)
+{
+    l2_pgentry_t e = { .l2_lo = (pfn << PAGE_SHIFT) | flags };
+    return e;
+}
+static inline l2_pgentry_t l2e_create_phys(u32 addr, u32 flags)
+{
+    l2_pgentry_t e = { .l2_lo = (addr & PAGE_MASK) | flags };
+    return e;
+}
+static inline void l2e_add_flags(l2_pgentry_t *e, u32 flags)
+{
+    e->l2_lo |= flags;
+}
+static inline void l2e_remove_flags(l2_pgentry_t *e, u32 flags)
+{
+    e->l2_lo &= ~flags;
+}
+
+/* check entries */
+static inline int l1e_has_changed(l1_pgentry_t *e1, l1_pgentry_t *e2, u32 flags)
+{
+    return ((e1->l1_lo ^ e2->l1_lo) & (PAGE_MASK | flags)) != 0;
+}
+static inline int l2e_has_changed(l2_pgentry_t *e1, l2_pgentry_t *e2, u32 flags)
+{
+    return ((e1->l2_lo ^ e2->l2_lo) & (PAGE_MASK | flags)) != 0;
+}
+
+#endif /* !__ASSEMBLY__ */
+
+/* root table */
+#define root_get_pfn        l2e_get_pfn
+#define root_get_flags      l2e_get_flags
+#define root_get_value      l2e_get_value
+#define root_empty          l2e_empty
+#define root_create_phys    l2e_create_phys
+#define PGT_root_page_table PGT_l2_page_table
+
+/* misc */
+#define is_guest_l1_slot(_s)    (1)
+#define is_guest_l2_slot(_t,_s) ((_s) < L2_PAGETABLE_FIRST_XEN_SLOT)
+
+#endif /* __X86_32_PAGE_2L_H__ */

Index: xen/include/asm-x86/x86_32/page-3l.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ xen/include/asm-x86/x86_32/page-3l.h	2005-05-13 14:02:31.000000000 +0200
@@ -0,0 +1,204 @@
+#ifndef __X86_32_PAGE_3L_H__
+#define __X86_32_PAGE_3L_H__
+
+#define L1_PAGETABLE_SHIFT      12
+#define L2_PAGETABLE_SHIFT      21
+#define L3_PAGETABLE_SHIFT      30
+#define PAGE_SHIFT              L1_PAGETABLE_SHIFT
+#define ROOT_PAGETABLE_SHIFT    L3_PAGETABLE_SHIFT
+
+#define PAGETABLE_ORDER         9
+#define L1_PAGETABLE_ENTRIES    (1<<PAGETABLE_ORDER)
+#define L2_PAGETABLE_ENTRIES    (1<<PAGETABLE_ORDER)
+#define L3_PAGETABLE_ENTRIES    4
+#define ROOT_PAGETABLE_ENTRIES  L3_PAGETABLE_ENTRIES
+
+#define PADDR_BITS              32     /* FIXME */
+#define PADDR_MASK              (~0UL) /* FIXME */
+
+#ifndef __ASSEMBLY__
+#include <asm/types.h>
+typedef struct { u32 l1_lo; u32 l1_hi; } l1_pgentry_t;
+typedef struct { u32 l2_lo; u32 l2_hi; } l2_pgentry_t;
+typedef struct { u32 l3_lo; u32 l3_hi; } l3_pgentry_t;
+typedef l3_pgentry_t root_pgentry_t;
+
+/* read access (deprecated) */
+static inline u64 l1e_get_value(l1_pgentry_t x)
+{
+    return ((u64)x.l1_lo | (u64)x.l1_hi << 32);
+}
+static inline u64 l2e_get_value(l2_pgentry_t x)
+{
+    return ((u64)x.l2_lo | (u64)x.l2_hi << 32);
+}
+static inline u64 l3e_get_value(l3_pgentry_t x)
+{
+    return ((u64)x.l3_lo | (u64)x.l3_hi << 32);
+}
+
+/* read access */
+static inline unsigned long l1e_get_pfn(l1_pgentry_t x)
+{
+    return (((x.l1_hi & 0x0fULL) << (32-PAGE_SHIFT)) |
+            (x.l1_lo >> PAGE_SHIFT));
+}
+static inline u64 l1e_get_phys(l1_pgentry_t x)
+{
+    return ((((u64)x.l1_hi & 0x0fULL) << 32) |
+            ((u64)x.l1_lo & PAGE_MASK));
+}
+static inline unsigned long l1e_get_flags(l1_pgentry_t x)
+{
+    return (x.l1_lo & ~PAGE_MASK);
+}
+
+static inline unsigned long l2e_get_pfn(l2_pgentry_t x)
+{
+    return (((x.l2_hi & 0x0fULL) << (32-PAGE_SHIFT)) |
+            (x.l2_lo >> PAGE_SHIFT));
+}
+static inline u64 l2e_get_phys(l2_pgentry_t x)
+{
+    return ((((u64)x.l2_hi & 0x0fULL) << 32) |
+            ((u64)x.l2_lo & PAGE_MASK));
+}
+static inline unsigned long l2e_get_flags(l2_pgentry_t x)
+{
+    return (x.l2_lo & ~PAGE_MASK);
+}
+
+static inline unsigned long l3e_get_pfn(l3_pgentry_t x)
+{
+    return (((x.l3_hi & 0x0fULL) << (32-PAGE_SHIFT)) |
+            (x.l3_lo >> PAGE_SHIFT));
+}
+static inline u64 l3e_get_phys(l3_pgentry_t x)
+{
+    return ((((u64)x.l3_hi & 0x0fULL) << 32) |
+            ((u64)x.l3_lo & PAGE_MASK));
+}
+static inline unsigned long l3e_get_flags(l3_pgentry_t x)
+{
+    return (x.l3_lo & ~PAGE_MASK);
+}
+
+/* write access */
+static inline l1_pgentry_t l1e_empty(void)
+{
+    l1_pgentry_t e = { .l1_hi = 0,
+                       .l1_lo = 0 };
+    return e;
+}
+static inline l1_pgentry_t l1e_create_pfn(u32 pfn, u32 flags)
+{
+    l1_pgentry_t e = { .l1_hi = (pfn >> (32-PAGE_SHIFT)) & 0x0f,
+                       .l1_lo = (pfn << PAGE_SHIFT) | flags };
+    return e;
+}
+static inline l1_pgentry_t l1e_create_phys(u64 addr, u32 flags)
+{
+    l1_pgentry_t e = { .l1_hi = (u32)((addr >> 32) & 0x0f),
+                       .l1_lo = (u32)((addr & PAGE_MASK)) | flags };
+    return e;
+}
+static inline void l1e_add_flags(l1_pgentry_t *e, u32 flags)
+{
+    e->l1_lo |= flags;
+}
+static inline void l1e_remove_flags(l1_pgentry_t *e, u32 flags)
+{
+    e->l1_lo &= ~flags;
+}
+
+static inline l2_pgentry_t l2e_empty(void)
+{
+    l2_pgentry_t e = { .l2_hi = 0,
+                       .l2_lo = 0 };
+    return e;
+}
+static inline l2_pgentry_t l2e_create_pfn(u32 pfn, u32 flags)
+{
+    l2_pgentry_t e = { .l2_hi = (pfn >> (32-PAGE_SHIFT)) & 0x0f,
+                       .l2_lo = (pfn << PAGE_SHIFT) | flags };
+    return e;
+}
+static inline l2_pgentry_t l2e_create_phys(u64 addr, u32 flags)
+{
+    l2_pgentry_t e = { .l2_hi = (u32)((addr >> 32) & 0x0f),
+                       .l2_lo = (u32)((addr & PAGE_MASK)) | flags };
+    return e;
+}
+static inline void l2e_add_flags(l2_pgentry_t *e, u32 flags)
+{
+    e->l2_lo |= flags;
+}
+static inline void l2e_remove_flags(l2_pgentry_t *e, u32 flags)
+{
+    e->l2_lo &= ~flags;
+}
+
+static inline l3_pgentry_t l3e_empty(void)
+{
+    l3_pgentry_t e = { .l3_hi = 0,
+                       .l3_lo = 0 };
+    return e;
+}
+static inline l3_pgentry_t l3e_create_pfn(u32 pfn, u32 flags)
+{
+    l3_pgentry_t e = { .l3_hi = (pfn >> (32-PAGE_SHIFT)) & 0x0f,
+                       .l3_lo = (pfn << PAGE_SHIFT) | flags };
+    return e;
+}
+static inline l3_pgentry_t l3e_create_phys(u64 addr, u32 flags)
+{
+    l3_pgentry_t e = { .l3_hi = (u32)((addr >> 32) & 0x0f),
+                       .l3_lo = (u32)((addr & PAGE_MASK)) | flags };
+    return e;
+}
+static inline void l3e_add_flags(l3_pgentry_t *e, u32 flags)
+{
+    e->l3_lo |= flags;
+}
+static inline void l3e_remove_flags(l3_pgentry_t *e, u32 flags)
+{
+    e->l3_lo &= ~flags;
+}
+
+/* check entries */
+static inline int l1e_has_changed(l1_pgentry_t *e1, l1_pgentry_t *e2, u32 flags)
+{
+    return ( ((e1->l1_lo ^ e2->l1_lo) & (PAGE_MASK | flags)) != 0 ||
+             ((e1->l1_hi ^ e2->l1_hi) & 0x0f                ) != 0 );
+}
+static inline int l2e_has_changed(l2_pgentry_t *e1, l2_pgentry_t *e2, u32 flags)
+{
+    return ( ((e1->l2_lo ^ e2->l2_lo) & (PAGE_MASK | flags)) != 0 ||
+             ((e1->l2_hi ^ e2->l2_hi) & 0x0f                ) != 0 );
+}
+static inline int l3e_has_changed(l3_pgentry_t *e1, l3_pgentry_t *e2, u32 flags)
+{
+    return ( ((e1->l3_lo ^ e2->l3_lo) & (PAGE_MASK | flags)) != 0 ||
+             ((e1->l3_hi ^ e2->l3_hi) & 0x0f                ) != 0 );
+}
+
+#endif /* !__ASSEMBLY__ */
+
+/* root table */
+#define root_get_pfn        l3e_get_pfn
+#define root_get_flags      l3e_get_flags
+#define root_get_value      l3e_get_value
+#define root_empty          l3e_empty
+#define root_init_phys      l3e_create_phys
+#define PGT_root_page_table PGT_l3_page_table
+
+/* misc */
+#define is_guest_l1_slot(_s) (1)
+#define is_guest_l2_slot(_t,_s)                                     \
+    ((3 != (((_t) & PGT_va_mask) >> PGT_va_shift)) ||               \
+     ((_s) < (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1))))
+#define is_guest_l3_slot(_s) (1)
+
+#endif /* __X86_32_PAGE_3L_H__ */
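A quick illustration of how the split accessors in page-3l.h fit
together (illustration only, not part of the patch; pae_roundtrip() is
a made-up helper):

    /* A PAE entry is stored as two u32s: the _hi word keeps physical
     * address bits 32-35 (the 0x0f masks), the _lo word keeps bits
     * 12-31 plus the flag bits. */
    static void pae_roundtrip(u64 paddr)
    {
        l2_pgentry_t e = l2e_create_phys(paddr, __PAGE_HYPERVISOR);
        /* l2e_get_phys() reassembles the full 36-bit address,
         * l2e_get_flags() returns the low attribute bits. */
        printk("phys %llx -> %llx, flags %lx\n",
               paddr, l2e_get_phys(e), l2e_get_flags(e));
    }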
Index: xen/arch/x86/boot/x86_32.S
===================================================================
--- xen.orig/arch/x86/boot/x86_32.S	2005-05-13 12:37:09.000000000 +0200
+++ xen/arch/x86/boot/x86_32.S	2005-05-13 12:37:42.000000000 +0200
@@ -100,6 +100,22 @@ __start:
         xor     %eax,%eax
         rep     stosb
 
+#ifdef CONFIG_X86_PAE
+        /* Initialize low and high mappings of all memory with 2MB pages */
+        mov     $idle_pg_table_l2-__PAGE_OFFSET,%edi
+        mov     $0xe3,%eax                   /* PRESENT+RW+A+D+2MB */
+1:      mov     %eax,__PAGE_OFFSET>>18(%edi) /* high mapping */
+        stosl                                /* low mapping */
+        add     $4,%edi
+        add     $(1<<L2_PAGETABLE_SHIFT),%eax
+        cmp     $DIRECTMAP_PHYS_END+0xe3,%eax
+        jne     1b
+1:      stosl   /* low mappings cover as much physmem as possible */
+        add     $4,%edi
+        add     $(1<<L2_PAGETABLE_SHIFT),%eax
+        cmp     $__HYPERVISOR_VIRT_START+0xe3,%eax
+        jne     1b
+#else
         /* Initialize low and high mappings of all memory with 4MB pages */
         mov     $idle_pg_table-__PAGE_OFFSET,%edi
         mov     $0xe3,%eax                   /* PRESENT+RW+A+D+4MB */
@@ -112,6 +128,7 @@ __start:
         add     $(1<<L2_PAGETABLE_SHIFT),%eax
         cmp     $__HYPERVISOR_VIRT_START+0xe3,%eax
         jne     1b
+#endif
 
         /* Initialise IDT with simple error defaults. */
         lea     ignore_int,%edx
@@ -204,17 +221,47 @@ ENTRY(gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00cf9a000000ffff     /* 0x0808 ring 0 4.00GB code at 0x0 */
         .quad 0x00cf92000000ffff     /* 0x0810 ring 0 4.00GB data at 0x0 */
+#ifdef CONFIG_X86_PAE
+        .quad 0x00cfba00000067ff
+        .quad 0x00cfb200000067ff
+        .quad 0x00cffa00000067ff
+        .quad 0x00cff200000067ff
+#else
         .quad 0x00cfba000000c3ff     /* 0x0819 ring 1 3.95GB code at 0x0 */
         .quad 0x00cfb2000000c3ff     /* 0x0821 ring 1 3.95GB data at 0x0 */
         .quad 0x00cffa000000c3ff     /* 0x082b ring 3 3.95GB code at 0x0 */
         .quad 0x00cff2000000c3ff     /* 0x0833 ring 3 3.95GB data at 0x0 */
+#endif
         .quad 0x0000000000000000     /* unused */
         .fill 2*NR_CPUS,8,0          /* space for TSS and LDT per CPU */
 
+#ifdef CONFIG_X86_PAE
+
         .org 0x1000
-ENTRY(idle_pg_table) # Initial page directory is 4kB
+ENTRY(idle_pg_table)
+ENTRY(idle_pg_table_l3)
+        .quad 0x100000 + 0x2000 + 0x01
+        .quad 0x100000 + 0x3000 + 0x01
+        .quad 0x100000 + 0x4000 + 0x01
+        .quad 0x100000 + 0x5000 + 0x01
+
+        .org 0x2000
+ENTRY(idle_pg_table_l2)
+
+        .org 0x6000
+ENTRY(cpu0_stack)
+        .org 0x6000 + STACK_SIZE
+
+#else /* CONFIG_X86_PAE */
+
+        .org 0x1000
+ENTRY(idle_pg_table)
+ENTRY(idle_pg_table_l2)  # Initial page directory is 4kB
         .org 0x2000
 ENTRY(cpu0_stack)
         .org 0x2000 + STACK_SIZE
+
+#endif /* CONFIG_X86_PAE */
+
 ENTRY(stext)
 ENTRY(_stext)

Index: xen/arch/x86/mm.c
===================================================================
--- xen.orig/arch/x86/mm.c	2005-05-13 12:37:10.000000000 +0200
+++ xen/arch/x86/mm.c	2005-05-13 12:49:06.000000000 +0200
@@ -121,7 +121,8 @@
 static void free_l2_table(struct pfn_info *page);
 static void free_l1_table(struct pfn_info *page);
 
-static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
+static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
+                        unsigned int type);
 static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
 
 /* Used to defer flushing of memory structures. */
@@ -371,7 +372,6 @@ static int get_page_and_type_from_pagenr
     return 1;
 }
 
-
 /*
  * We allow root tables to map each other (a.k.a. linear page tables). It
  * needs some special care with reference counts and access permissions:
@@ -428,7 +428,6 @@ get_linear_pagetable(
     return 1;
 }
 
-
 int
 get_page_from_l1e(
     l1_pgentry_t l1e, struct domain *d)
@@ -442,8 +441,9 @@ get_page_from_l1e(
 
     if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) )
     {
-        MEM_LOG("Bad L1 type settings %lx %lx", l1e_get_value(l1e),
-                l1e_get_value(l1e) & L1_DISALLOW_MASK);
+        MEM_LOG("Bad L1 type settings %llx %llx",
+                (u64)l1e_get_value(l1e),
+                (u64)(l1e_get_value(l1e) & L1_DISALLOW_MASK));
         return 0;
     }
 
@@ -478,7 +478,7 @@ get_page_from_l1e(
 static int 
 get_page_from_l2e(
     l2_pgentry_t l2e, unsigned long pfn,
-    struct domain *d, unsigned long va_idx)
+    struct domain *d, unsigned long vaddr)
 {
     int rc;
 
@@ -489,45 +489,60 @@ get_page_from_l2e(
 
     if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
     {
-        MEM_LOG("Bad L2 page type settings %lx",
-                l2e_get_value(l2e) & L2_DISALLOW_MASK);
+        MEM_LOG("Bad L2 page type settings %llx",
+                (u64)(l2e_get_value(l2e) & L2_DISALLOW_MASK));
         return 0;
     }
 
+    vaddr >>= L2_PAGETABLE_SHIFT;
+    vaddr <<= PGT_va_shift;
     rc = get_page_and_type_from_pagenr(
-        l2e_get_pfn(l2e),
-        PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
+        l2e_get_pfn(l2e), PGT_l1_page_table | vaddr, d);
 
-#if defined(__i386__)
-    return rc ? rc : get_linear_pagetable(l2e, pfn, d);
-#elif defined(__x86_64__)
-    return rc;
+#if CONFIG_PAGING_LEVELS == 2
+    if (!rc)
+        rc = get_linear_pagetable(l2e, pfn, d);
 #endif
+    return rc;
 }
 
-#ifdef __x86_64__
+#if CONFIG_PAGING_LEVELS >= 3
 
 static int 
 get_page_from_l3e(
-    l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
+    l3_pgentry_t l3e, unsigned long pfn,
+    struct domain *d, unsigned long vaddr)
 {
     ASSERT( !shadow_mode_refcounts(d) );
 
+    int rc;
+
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
         return 1;
 
     if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) )
     {
-        MEM_LOG("Bad L3 page type settings %lx",
-                l3e_get_value(l3e) & L3_DISALLOW_MASK);
+        MEM_LOG("Bad L3 page type settings %llx",
+                (u64)(l3e_get_value(l3e) & L3_DISALLOW_MASK));
        return 0;
     }
 
-    return get_page_and_type_from_pagenr(
-        l3e_get_pfn(l3e), PGT_l2_page_table, d);
+    vaddr >>= L3_PAGETABLE_SHIFT;
+    vaddr <<= PGT_va_shift;
+    rc = get_page_and_type_from_pagenr(
+        l3e_get_pfn(l3e),
+        PGT_l2_page_table | vaddr, d);
+#if CONFIG_PAGING_LEVELS == 3
+    if (!rc)
+        rc = get_linear_pagetable(l3e, pfn, d);
+#endif
+    return rc;
 }
 
+#endif /* 3 level */
+
+#if CONFIG_PAGING_LEVELS >= 4
+
 static int 
 get_page_from_l4e(
@@ -556,7 +571,7 @@ get_page_from_l4e(
     return 1;
 }
 
-#endif /* __x86_64__ */
+#endif /* 4 level */
 
 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
@@ -618,7 +633,7 @@ static void put_page_from_l2e(l2_pgentry
 }
 
-#ifdef __x86_64__
+#if CONFIG_PAGING_LEVELS >= 3
 
 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
 {
@@ -627,6 +642,9 @@ static void put_page_from_l3e(l3_pgentry
         put_page_and_type(&frame_table[l3e_get_pfn(l3e)]);
 }
 
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
 
 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
 {
@@ -635,7 +653,7 @@ static void put_page_from_l4e(l4_pgentry
         put_page_and_type(&frame_table[l4e_get_pfn(l4e)]);
 }
 
-#endif /* __x86_64__ */
+#endif
 
 static int
 alloc_l1_table(struct pfn_info *page)
@@ -666,11 +684,58 @@ static int alloc_l1_table(struct pfn_inf
     return 0;
 }
 
+#ifdef CONFIG_X86_PAE
+static inline int fixup_pae_linear_mappings(l3_pgentry_t *pl3e)
+{
+    l2_pgentry_t *pl2e;
+    unsigned long vaddr;
+    int i,idx;
 
-static int alloc_l2_table(struct pfn_info *page)
+    while ((unsigned long)pl3e & ~PAGE_MASK)
+        pl3e--;
+
+    if (!(l3e_get_flags(pl3e[3]) & _PAGE_PRESENT)) {
+        printk("Installing a L3 PAE pt without L2 in slot #3 isn't going to fly ...\n");
+        return 0;
+    }
+
+    pl2e = map_domain_mem(l3e_get_phys(pl3e[3]));
+    for (i = 0; i < 4; i++) {
+        vaddr = LINEAR_PT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
+        idx = (vaddr >> L2_PAGETABLE_SHIFT) & (L2_PAGETABLE_ENTRIES-1);
+        if (l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) {
+            pl2e[idx] = l2e_create_phys(l3e_get_phys(pl3e[i]),
+                                        __PAGE_HYPERVISOR);
+        } else
+            pl2e[idx] = l2e_empty();
+    }
+    unmap_domain_mem(pl2e);
+
+    return 1;
+}
+
+static inline unsigned long fixup_pae_vaddr(unsigned long l2vaddr,
+                                            unsigned long l2type)
+{
+    unsigned long l3vaddr;
+
+    if ((l2type & PGT_va_mask) == PGT_va_unknown)
+        BUG(); /* FIXME: do something more elegant here ... */
+    l3vaddr = ((l2type & PGT_va_mask) >> PGT_va_shift)
+        << L3_PAGETABLE_SHIFT;
+    return l3vaddr + l2vaddr;
+}
+
+#else
+# define fixup_pae_linear_mappings(unused) (1)
+# define fixup_pae_vaddr(vaddr, type) (vaddr)
+#endif
+
+static int alloc_l2_table(struct pfn_info *page, unsigned int type)
 {
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_pfn(page);
+    unsigned long vaddr;
     l2_pgentry_t *pl2e;
     int i;
@@ -678,34 +743,55 @@ static int alloc_l2_table(struct pfn_inf
     if ( (PGT_base_page_table == PGT_l2_page_table) &&
          unlikely(shadow_mode_refcounts(d)) )
         return 1;
-    ASSERT( !shadow_mode_refcounts(d) );
 
+    ASSERT( !shadow_mode_refcounts(d) );
 
     pl2e = map_domain_mem(pfn << PAGE_SHIFT);
 
-    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-        if ( is_guest_l2_slot(i) &&
-             unlikely(!get_page_from_l2e(pl2e[i], pfn, d, i)) )
+    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) {
+        vaddr = i << L2_PAGETABLE_SHIFT;
+        vaddr = fixup_pae_vaddr(vaddr,type);
+        if ( is_guest_l2_slot(type, i) &&
+             unlikely(!get_page_from_l2e(pl2e[i], pfn, d, vaddr)) )
             goto fail;
+    }
 
-#if defined(__i386__)
+#if CONFIG_PAGING_LEVELS == 2
     /* Xen private mappings. */
-    memcpy(&pl2e[ROOT_PAGETABLE_FIRST_XEN_SLOT],
-           &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT],
-           ROOT_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
+    memcpy(&pl2e[L2_PAGETABLE_FIRST_XEN_SLOT],
+           &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT],
+           L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
     pl2e[l2_table_offset(LINEAR_PT_VIRT_START)] =
         l2e_create_pfn(pfn, __PAGE_HYPERVISOR);
     pl2e[l2_table_offset(PERDOMAIN_VIRT_START)] =
         l2e_create_phys(__pa(page_get_owner(page)->arch.mm_perdomain_pt),
                         __PAGE_HYPERVISOR);
 #endif
+#if CONFIG_PAGING_LEVELS == 3
+    if (3 == ((type & PGT_va_mask) >> PGT_va_shift)) {
+        unsigned long v,src,dst;
+        /* Xen private mappings. */
+        dst = L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1);
+        src = L2_PAGETABLE_FIRST_XEN_SLOT;
+        memcpy(&pl2e[dst], &idle_pg_table_l2[src],
+               L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t));
+        for (v = PERDOMAIN_VIRT_START; v < PERDOMAIN_VIRT_END;
+             v += (1 << L2_PAGETABLE_SHIFT)) {
+            dst = (v >> L2_PAGETABLE_SHIFT) & (L2_PAGETABLE_ENTRIES-1);
+            pl2e[dst] =
+                l2e_create_phys(__pa(d->arch.mm_perdomain_pt) + (v-PERDOMAIN_VIRT_START),
+                                __PAGE_HYPERVISOR);
+        }
+        /* see fixup_pae_linear_mappings() for linear pagetables */
+    }
+#endif
 
     unmap_domain_mem(pl2e);
     return 1;
 
  fail:
     while ( i-- > 0 )
-        if ( is_guest_l2_slot(i) )
+        if ( is_guest_l2_slot(type, i) )
             put_page_from_l2e(pl2e[i], pfn);
 
     unmap_domain_mem(pl2e);
@@ -713,22 +799,29 @@ static int alloc_l2_table(struct pfn_inf
 }
 
-#ifdef __x86_64__
+#if CONFIG_PAGING_LEVELS >= 3
 
 static int alloc_l3_table(struct pfn_info *page)
 {
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_pfn(page);
-    l3_pgentry_t *pl3e = page_to_virt(page);
+    unsigned long vaddr;
+    l3_pgentry_t *pl3e;
     int i;
 
     ASSERT( !shadow_mode_refcounts(d) );
 
-    for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+    pl3e = map_domain_mem(pfn << PAGE_SHIFT);
+    for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) {
+        vaddr = i << L3_PAGETABLE_SHIFT;
         if ( is_guest_l3_slot(i) &&
-             unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
+             unlikely(!get_page_from_l3e(pl3e[i], pfn, d, vaddr)) )
             goto fail;
+    }
 
+    if (!fixup_pae_linear_mappings(pl3e))
+        goto fail;
+
+    unmap_domain_mem(pl3e);
     return 1;
 
  fail:
@@ -736,9 +829,13 @@ static int alloc_l3_table(struct pfn_inf
         if ( is_guest_l3_slot(i) )
             put_page_from_l3e(pl3e[i], pfn);
 
+    unmap_domain_mem(pl3e);
     return 0;
 }
 
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
 
 static int alloc_l4_table(struct pfn_info *page)
 {
@@ -807,27 +904,35 @@ static void free_l2_table(struct pfn_inf
 
     pl2e = map_domain_mem(pfn << PAGE_SHIFT);
 
-    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-        if ( is_guest_l2_slot(i) )
+    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) {
+        if ( is_guest_l2_slot(page->u.inuse.type_info, i) )
             put_page_from_l2e(pl2e[i], pfn);
+    }
 
     unmap_domain_mem(pl2e);
 }
 
-#ifdef __x86_64__
+#if CONFIG_PAGING_LEVELS >= 3
 
 static void free_l3_table(struct pfn_info *page)
 {
     unsigned long pfn = page_to_pfn(page);
-    l3_pgentry_t *pl3e = page_to_virt(page);
+    l3_pgentry_t *pl3e;
     int i;
 
+    pl3e = map_domain_mem(pfn << PAGE_SHIFT);
+
     for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
         if ( is_guest_l3_slot(i) )
             put_page_from_l3e(pl3e[i], pfn);
+
+    unmap_domain_mem(pl3e);
 }
 
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
 
 static void free_l4_table(struct pfn_info *page)
 {
@@ -840,25 +945,29 @@ static void free_l4_table(struct pfn_inf
             put_page_from_l4e(pl4e[i], pfn);
 }
 
-#endif /* __x86_64__ */
-
+#endif
 
 static inline int update_l1e(l1_pgentry_t *pl1e, 
                              l1_pgentry_t  ol1e, 
                              l1_pgentry_t  nl1e)
 {
-    /* FIXME: breaks with PAE */
+#if defined(__i386__) && defined(CONFIG_X86_PAE)
+    u64 o = l1e_get_value(ol1e);
+    u64 n = l1e_get_value(nl1e);
+#else
     unsigned long o = l1e_get_value(ol1e);
     unsigned long n = l1e_get_value(nl1e);
+#endif
 
     if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ||
         unlikely(o != l1e_get_value(ol1e)) )
     {
-        MEM_LOG("Failed to update %lx -> %lx: saw %lx",
-                l1e_get_value(ol1e), l1e_get_value(nl1e), o);
+        MEM_LOG("Failed to update %llx -> %llx: saw %llx",
+                (u64)l1e_get_value(ol1e),
+                (u64)l1e_get_value(nl1e),
+                (u64)o);
         return 0;
     }
-
     return 1;
 }
@@ -879,8 +988,8 @@ static int mod_l1_entry(l1_pgentry_t *pl
     {
         if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
         {
-            MEM_LOG("Bad L1 type settings %lx",
-                    l1e_get_value(nl1e) & L1_DISALLOW_MASK);
+            MEM_LOG("Bad L1 type settings %llx",
+                    (u64)(l1e_get_value(nl1e) & L1_DISALLOW_MASK));
             return 0;
         }
@@ -913,19 +1022,23 @@
                                 _t ## e_get_value(_o),                  \
                                 _t ## e_get_value(_n));                 \
     if ( __o != _t ## e_get_value(_o) )                                 \
-        MEM_LOG("Failed to update %lx -> %lx: saw %lx",                 \
-                _t ## e_get_value(_o), _t ## e_get_value(_n), __o);     \
+        MEM_LOG("Failed to update %llx -> %llx: saw %llx",              \
+                (u64)(_t ## e_get_value(_o)),                           \
+                (u64)(_t ## e_get_value(_n)),                           \
+                (u64)(__o));                                            \
     (__o == _t ## e_get_value(_o)); })
 
 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
 static int mod_l2_entry(l2_pgentry_t *pl2e, 
                         l2_pgentry_t nl2e, 
-                        unsigned long pfn)
+                        unsigned long pfn,
+                        unsigned int type)
 {
     l2_pgentry_t ol2e;
+    unsigned long vaddr;
 
-    if ( unlikely(!is_guest_l2_slot(pgentry_ptr_to_slot(pl2e))) )
+    if ( unlikely(!is_guest_l2_slot(type,pgentry_ptr_to_slot(pl2e))) )
     {
         MEM_LOG("Illegal L2 update attempt in Xen-private area %p", pl2e);
         return 0;
@@ -938,8 +1051,8 @@ static int mod_l2_entry(l2_pgentry_t *pl
     {
         if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
         {
-            MEM_LOG("Bad L2 type settings %lx",
-                    l2e_get_value(nl2e) & L2_DISALLOW_MASK);
+            MEM_LOG("Bad L2 type settings %llx",
+                    (u64)(l2e_get_value(nl2e) & L2_DISALLOW_MASK));
             return 0;
         }
 
@@ -947,9 +1060,10 @@ static int mod_l2_entry(l2_pgentry_t *pl
         if ( !l2e_has_changed(&ol2e, &nl2e, _PAGE_PRESENT))
             return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
 
-        if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain,
-                                         ((unsigned long)pl2e &
-                                          ~PAGE_MASK) >> 2)) )
+        vaddr = (((unsigned long)pl2e & ~PAGE_MASK) / sizeof(l2_pgentry_t))
+            << L2_PAGETABLE_SHIFT;
+        vaddr = fixup_pae_vaddr(vaddr,type);
+        if ( unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
             return 0;
 
         if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
@@ -969,7 +1083,7 @@ static int mod_l2_entry(l2_pgentry_t *pl
 }
 
-#ifdef __x86_64__
+#if CONFIG_PAGING_LEVELS >= 3
 
 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
 static int mod_l3_entry(l3_pgentry_t *pl3e, 
@@ -977,6 +1091,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
                         unsigned long pfn)
 {
     l3_pgentry_t ol3e;
+    unsigned long vaddr;
 
     if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
     {
@@ -991,8 +1106,8 @@ static int mod_l3_entry(l3_pgentry_t *pl
     {
         if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) )
         {
-            MEM_LOG("Bad L3 type settings %lx",
-                    l3e_get_value(nl3e) & L3_DISALLOW_MASK);
+            MEM_LOG("Bad L3 type settings %llx",
+                    (u64)(l3e_get_value(nl3e) & L3_DISALLOW_MASK));
             return 0;
         }
 
@@ -1000,26 +1115,33 @@ static int mod_l3_entry(l3_pgentry_t *pl
         if (!l3e_has_changed(&ol3e, &nl3e, _PAGE_PRESENT))
             return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
 
-        if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain)) )
+        vaddr = (((unsigned long)pl3e & ~PAGE_MASK) / sizeof(l3_pgentry_t))
+            << L3_PAGETABLE_SHIFT;
+        if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
             return 0;
 
-        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
+        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e) ||
+                      !fixup_pae_linear_mappings(pl3e)) )
         {
             put_page_from_l3e(nl3e, pfn);
             return 0;
        }
-        
+
         put_page_from_l3e(ol3e, pfn);
         return 1;
     }
 
-    if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
+    if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e) ||
+                  !fixup_pae_linear_mappings(pl3e)) )
         return 0;
 
     put_page_from_l3e(ol3e, pfn);
     return 1;
 }
 
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+
 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
 static int mod_l4_entry(l4_pgentry_t *pl4e, 
@@ -1070,20 +1192,21 @@ static int mod_l4_entry(l4_pgentry_t *pl
     return 1;
 }
 
-#endif /* __x86_64__ */
-
+#endif
 
 int alloc_page_type(struct pfn_info *page, unsigned int type)
 {
-    switch ( type )
+    switch ( type & PGT_type_mask )
     {
     case PGT_l1_page_table:
         return alloc_l1_table(page);
     case PGT_l2_page_table:
-        return alloc_l2_table(page);
-#ifdef __x86_64__
+        return alloc_l2_table(page, type);
+#if CONFIG_PAGING_LEVELS >= 3
     case PGT_l3_page_table:
         return alloc_l3_table(page);
+#endif
+#if CONFIG_PAGING_LEVELS >= 4
     case PGT_l4_page_table:
         return alloc_l4_table(page);
 #endif
@@ -1118,7 +1241,7 @@ void free_page_type(struct pfn_info *pag
         }
     }
 
-    switch ( type )
+    switch (type & PGT_type_mask)
     {
     case PGT_l1_page_table:
         free_l1_table(page);
@@ -1128,17 +1251,21 @@ void free_page_type(struct pfn_info *pag
         free_l2_table(page);
         break;
 
-#ifdef __x86_64__
+#if CONFIG_PAGING_LEVELS >= 3
     case PGT_l3_page_table:
         free_l3_table(page);
         break;
+#endif
 
+#if CONFIG_PAGING_LEVELS >= 4
     case PGT_l4_page_table:
         free_l4_table(page);
         break;
 #endif
 
     default:
+        printk("%s: type %x pfn %lx\n",__FUNCTION__,
+               type, page_to_pfn(page));
         BUG();
     }
 }
@@ -1181,7 +1308,7 @@ void put_page_type(struct pfn_info *page
                            x & ~PGT_validated)) != x) )
             goto again;
         /* We cleared the 'valid bit' so we do the clean up. */
-        free_page_type(page, x & PGT_type_mask);
+        free_page_type(page, x);
         /* Carry on, but with the 'valid bit' now clear. */
         x  &= ~PGT_validated;
         nx &= ~PGT_validated;
@@ -1264,6 +1391,10 @@ int get_page_type(struct pfn_info *page,
                 /* This table is may be mapped at multiple locations. */
                 nx &= ~PGT_va_mask;
                 nx |= PGT_va_unknown;
+#if 0 /* debug */
+                printk("%s: pfn %lx type %x -> %x (tag as unknown)\n",
+                       __FUNCTION__,page_to_pfn(page),x,nx);
+#endif
             }
         }
         if ( unlikely(!(x & PGT_validated)) )
@@ -1280,7 +1411,7 @@ int get_page_type(struct pfn_info *page,
     if ( unlikely(!(nx & PGT_validated)) )
     {
         /* Try to validate page type; drop the new reference on failure. */
-        if ( unlikely(!alloc_page_type(page, type & PGT_type_mask)) )
+        if ( unlikely(!alloc_page_type(page, type)) )
         {
             MEM_LOG("Error while validating pfn %lx for type %08x."
" caf=%08x taf=%08x", @@ -1531,15 +1662,17 @@ int do_mmuext_op( type = PGT_l2_page_table; goto pin_page; -#ifdef __x86_64__ +#if CONFIG_PAGING_LEVELS >= 3 case MMUEXT_PIN_L3_TABLE: type = PGT_l3_page_table; goto pin_page; +#endif +#if CONFIG_PAGING_LEVELS >= 4 case MMUEXT_PIN_L4_TABLE: type = PGT_l4_page_table; goto pin_page; -#endif /* __x86_64__ */ +#endif case MMUEXT_UNPIN_TABLE: if ( unlikely(!(okay = get_page_from_pagenr(op.mfn, FOREIGNDOM))) ) @@ -1906,19 +2039,20 @@ int do_mmu_update( break; case PGT_l2_page_table: ASSERT( !shadow_mode_refcounts(d) ); - if ( likely(get_page_type(page, PGT_l2_page_table)) ) + if ( likely(get_page_type( + page, type_info & (PGT_type_mask|PGT_va_mask))) ) { l2_pgentry_t l2e; /* FIXME: doesn't work with PAE */ l2e = l2e_create_phys(req.val, req.val); - okay = mod_l2_entry(va, l2e, mfn); + okay = mod_l2_entry((l2_pgentry_t *)va, l2e, mfn, type_info); if ( okay && unlikely(shadow_mode_enabled(d)) ) shadow_l2_normal_pt_update(d, req.ptr, l2e, &sh_mapcache); put_page_type(page); } break; -#ifdef __x86_64__ +#if CONFIG_PAGING_LEVELS >= 3 case PGT_l3_page_table: ASSERT( !shadow_mode_refcounts(d) ); if ( likely(get_page_type(page, PGT_l3_page_table)) ) @@ -1933,6 +2067,8 @@ int do_mmu_update( put_page_type(page); } break; +#endif +#if CONFIG_PAGING_LEVELS >= 4 case PGT_l4_page_table: ASSERT( !shadow_mode_refcounts(d) ); if ( likely(get_page_type(page, PGT_l4_page_table)) ) @@ -1946,7 +2082,7 @@ int do_mmu_update( put_page_type(page); } break; -#endif /* __x86_64__ */ +#endif default: if ( likely(get_page_type(page, PGT_writable_page)) ) { @@ -2113,9 +2249,10 @@ int update_grant_va_mapping(unsigned lon int do_update_va_mapping(unsigned long va, - l1_pgentry_t val, + unsigned long val32, unsigned long flags) { + l1_pgentry_t val = l1e_create_phys(val32,val32); struct exec_domain *ed = current; struct domain *d = ed->domain; unsigned int cpu = ed->processor; @@ -2210,7 +2347,7 @@ int do_update_va_mapping(unsigned long v } int do_update_va_mapping_otherdomain(unsigned long va, - l1_pgentry_t val, + unsigned long val32, unsigned long flags, domid_t domid) { @@ -2228,7 +2365,7 @@ int do_update_va_mapping_otherdomain(uns return -ESRCH; } - rc = do_update_va_mapping(va, val, flags); + rc = do_update_va_mapping(va, val32, flags); return rc; } @@ -2582,8 +2719,8 @@ void ptwr_flush(struct domain *d, const static int ptwr_emulated_update( unsigned long addr, - unsigned long old, - unsigned long val, + physaddr_t old, + physaddr_t val, unsigned int bytes, unsigned int do_cmpxchg) { @@ -2601,21 +2738,22 @@ static int ptwr_emulated_update( } /* Turn a sub-word access into a full-word access. */ - /* FIXME: needs tweaks for PAE */ - if ( (addr & ((BITS_PER_LONG/8)-1)) != 0 ) + if (bytes != sizeof(physaddr_t)) { int rc; - unsigned long full; - unsigned int mask = addr & ((BITS_PER_LONG/8)-1); + physaddr_t full; + unsigned int offset = addr & (sizeof(physaddr_t)-1); + /* Align address; read full word. */ - addr &= ~((BITS_PER_LONG/8)-1); - if ( (rc = x86_emulate_read_std(addr, &full, BITS_PER_LONG/8)) ) - return rc; + addr &= ~(sizeof(physaddr_t)-1); + if ( (rc = x86_emulate_read_std(addr, (unsigned long *)&full, + sizeof(physaddr_t))) ) + return rc; /* Mask out bits provided by caller. */ - full &= ~((1UL << (bytes*8)) - 1UL) << (mask*8); + full &= ~((((physaddr_t)1 << (bytes*8)) - 1) << (offset*8)); /* Shift the caller value and OR in the missing bits. 
-        val  &= (1UL << (bytes*8)) - 1UL;
-        val <<= mask*8;
+        val  &= (((physaddr_t)1 << (bytes*8)) - 1);
+        val <<= (offset)*8;
         val  |= full;
     }
 
@@ -2635,8 +2773,8 @@ static int ptwr_emulated_update(
          ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) ||
         (page_get_owner(page) != d) )
     {
-        MEM_LOG("ptwr_emulate: Page is mistyped or bad pte (%lx, %08x)\n",
-                l1e_get_pfn(pte), page->u.inuse.type_info);
+        MEM_LOG("ptwr_emulate: Page is mistyped or bad pte (%llx, %08x)\n",
+                (u64)l1e_get_pfn(pte), page->u.inuse.type_info);
         return X86EMUL_UNHANDLEABLE;
     }
 
@@ -2870,7 +3008,23 @@ void ptwr_destroy(struct domain *d)
     free_xenheap_page((unsigned long)d->arch.ptwr[PTWR_PT_INACTIVE].page);
 }
 
+/* for printk debugging ;) */
+void printk_page_flags(u32 flags)
+{
+    static const char *names[12] = {
+        "present", "rw",       "user",  "pwt",
+        "pcd",     "accessed", "dirty", "pat/pse",
+        "global",  "os#1",     "os#2",  "os#3"
+    };
+    int i, first = 1;
+
+    for (i = 11; i >= 0; i--) {
+        if (!(flags & (1<<i)))
+            continue;
+        printk("%s%s", first ? "flags=" :",", names[i]);
+        first=0;
+    }
+}
 
 /************************************************************************/
 /************************************************************************/
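For readers following the type_info changes in mm.c above: the va field
in a page's type word now records which slot a page table is installed
at, shifted per level. A sketch of the encoding as used by
get_page_from_l2e() (illustration only; va is the hypothetical address
the l1 table is mapped at):

    /* An l1 table mapped at virtual address va gets its l2 slot
     * number stored in the type word; PGT_va_unknown marks tables
     * mapped at more than one address. */
    unsigned long slot = va >> L2_PAGETABLE_SHIFT;
    unsigned long type = PGT_l1_page_table | (slot << PGT_va_shift);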
Index: xen/include/asm-x86/x86_32/page.h
===================================================================
--- xen.orig/include/asm-x86/x86_32/page.h	2005-05-13 12:37:10.000000000 +0200
+++ xen/include/asm-x86/x86_32/page.h	2005-05-13 12:37:42.000000000 +0200
@@ -2,129 +2,38 @@
 #ifndef __X86_32_PAGE_H__
 #define __X86_32_PAGE_H__
 
-#define L1_PAGETABLE_SHIFT      12
-#define L2_PAGETABLE_SHIFT      22
-#define PAGE_SHIFT              L1_PAGETABLE_SHIFT
-#define ROOT_PAGETABLE_SHIFT    L2_PAGETABLE_SHIFT
-
-#define PAGETABLE_ORDER         10
-#define L1_PAGETABLE_ENTRIES    (1<<PAGETABLE_ORDER)
-#define L2_PAGETABLE_ENTRIES    (1<<PAGETABLE_ORDER)
-#define ROOT_PAGETABLE_ENTRIES  L2_PAGETABLE_ENTRIES
-
 #define __PAGE_OFFSET           (0xFF000000)
 
-#define PADDR_BITS              32
-#define VADDR_BITS              32
 #define PADDR_MASK              (~0UL)
 #define VADDR_MASK              (~0UL)
 
 #define _PAGE_NX                0UL
 #define PAGE_FLAG_MASK          0xfff
 
-#ifndef __ASSEMBLY__
 #include <xen/config.h>
-#include <asm/types.h>
-typedef struct { u32 l1_lo; } l1_pgentry_t;
-typedef struct { u32 l2_lo; } l2_pgentry_t;
-typedef l2_pgentry_t root_pgentry_t;
-
-/* read access (deprecated) */
-#define l1e_get_value(_x)   ((unsigned long)((_x).l1_lo))
-#define l2e_get_value(_x)   ((unsigned long)((_x).l2_lo))
-
-/* read access */
-#define l1e_get_pfn(_x)     ((unsigned long)((_x).l1_lo >> PAGE_SHIFT))
-#define l1e_get_phys(_x)    ((unsigned long)((_x).l1_lo &  PAGE_MASK))
-#define l1e_get_flags(_x)   ((unsigned long)((_x).l1_lo &  PAGE_FLAG_MASK))
-
-#define l2e_get_pfn(_x)     ((unsigned long)((_x).l2_lo >> PAGE_SHIFT))
-#define l2e_get_phys(_x)    ((unsigned long)((_x).l2_lo &  PAGE_MASK))
-#define l2e_get_flags(_x)   ((unsigned long)((_x).l2_lo &  PAGE_FLAG_MASK))
-
-/* write access */
-static inline l1_pgentry_t l1e_empty(void)
-{
-    l1_pgentry_t e = { .l1_lo = 0 };
-    return e;
-}
-static inline l1_pgentry_t l1e_create_pfn(u32 pfn, u32 flags)
-{
-    l1_pgentry_t e = { .l1_lo = (pfn << PAGE_SHIFT) | flags };
-    return e;
-}
-static inline l1_pgentry_t l1e_create_phys(u32 addr, u32 flags)
-{
-    l1_pgentry_t e = { .l1_lo = (addr & PAGE_MASK) | flags };
-    return e;
-}
-static inline void l1e_add_flags(l1_pgentry_t *e, u32 flags)
-{
-    e->l1_lo |= flags;
-}
-static inline void l1e_remove_flags(l1_pgentry_t *e, u32 flags)
-{
-    e->l1_lo &= ~flags;
-}
-
-static inline l2_pgentry_t l2e_empty(void)
-{
-    l2_pgentry_t e = { .l2_lo = 0 };
-    return e;
-}
-static inline l2_pgentry_t l2e_create_pfn(u32 pfn, u32 flags)
-{
-    l2_pgentry_t e = { .l2_lo = (pfn << PAGE_SHIFT) | flags };
-    return e;
-}
-static inline l2_pgentry_t l2e_create_phys(u32 addr, u32 flags)
-{
-    l2_pgentry_t e = { .l2_lo = (addr & PAGE_MASK) | flags };
-    return e;
-}
-static inline void l2e_add_flags(l2_pgentry_t *e, u32 flags)
-{
-    e->l2_lo |= flags;
-}
-static inline void l2e_remove_flags(l2_pgentry_t *e, u32 flags)
-{
-    e->l2_lo &= ~flags;
-}
-
-/* check entries */
-static inline int l1e_has_changed(l1_pgentry_t *e1, l1_pgentry_t *e2, u32 flags)
-{
-    return ((e1->l1_lo ^ e2->l1_lo) & (PAGE_MASK | flags)) != 0;
-}
-static inline int l2e_has_changed(l2_pgentry_t *e1, l2_pgentry_t *e2, u32 flags)
-{
-    return ((e1->l2_lo ^ e2->l2_lo) & (PAGE_MASK | flags)) != 0;
-}
-
-#endif /* !__ASSEMBLY__ */
+#ifdef CONFIG_X86_PAE
+# include <asm/x86_32/page-3l.h>
+#else
+# include <asm/x86_32/page-2l.h>
+#endif
 
 /* Pagetable walking. */
 #define l2e_to_l1e(_x) \
   ((l1_pgentry_t *)__va(l2e_get_phys(_x)))
+#define l3e_to_l2e(_x) \
+  ((l2_pgentry_t *)__va(l3e_get_phys(_x)))
 
 /* Given a virtual address, get an entry offset into a page table. */
 #define l1_table_offset(_a) \
   (((_a) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1))
 #define l2_table_offset(_a) \
-  ((_a) >> L2_PAGETABLE_SHIFT)
+  (((_a) >> L2_PAGETABLE_SHIFT) & (L2_PAGETABLE_ENTRIES - 1))
+#define l3_table_offset(_a) \
+  ((_a) >> L3_PAGETABLE_SHIFT)
 
 /* Given a virtual address, get an entry offset into a linear page table. */
-#define l1_linear_offset(_a) ((_a) >> PAGE_SHIFT)
-
-#define is_guest_l1_slot(_s) (1)
-#define is_guest_l2_slot(_s) ((_s) < ROOT_PAGETABLE_FIRST_XEN_SLOT)
-
-#define root_get_pfn        l2e_get_pfn
-#define root_get_flags      l2e_get_flags
-#define root_get_value      l2e_get_value
-#define root_empty          l2e_empty
-#define root_create_phys    l2e_create_phys
-#define PGT_root_page_table PGT_l2_page_table
+#define l1_linear_offset(_a) ((_a) >> L1_PAGETABLE_SHIFT)
+#define l2_linear_offset(_a) ((_a) >> L2_PAGETABLE_SHIFT)
 
 #define L1_DISALLOW_MASK (3UL << 7)
 #define L2_DISALLOW_MASK (7UL << 7)

Index: xen/arch/x86/x86_32/traps.c
===================================================================
--- xen.orig/arch/x86/x86_32/traps.c	2005-05-13 12:37:10.000000000 +0200
+++ xen/arch/x86/x86_32/traps.c	2005-05-13 12:37:42.000000000 +0200
@@ -160,21 +160,24 @@ void show_registers(struct cpu_user_regs
 
 void show_page_walk(unsigned long addr)
 {
-    unsigned long page;
+    l2_pgentry_t pmd;
+    l1_pgentry_t *pte;
 
     if ( addr < PAGE_OFFSET )
         return;
 
     printk("Pagetable walk from %08lx:\n", addr);
 
-    page = l2e_get_value(idle_pg_table[l2_table_offset(addr)]);
-    printk(" L2 = %08lx %s\n", page, (page & _PAGE_PSE) ? "(4MB)" : "");
-    if ( !(page & _PAGE_PRESENT) || (page & _PAGE_PSE) )
+    pmd = idle_pg_table_l2[l2_linear_offset(addr)];
+    printk(" L2 = %08llx %s\n", (u64)l2e_get_value(pmd),
"(2/4MB)" : ""); + if ( !(l2e_get_flags(pmd) & _PAGE_PRESENT) || + (l2e_get_flags(pmd) & _PAGE_PSE) ) return; - page &= PAGE_MASK; - page = ((unsigned long *) __va(page))[l1_table_offset(addr)]; - printk(" L1 = %08lx\n", page); + pte = __va(l2e_get_phys(pmd)); + pte += l1_table_offset(addr); + printk(" L1 = %08llx\n", (u64)l1e_get_value(*pte)); } #define DOUBLEFAULT_STACK_SIZE 1024 Index: xen/arch/x86/x86_32/mm.c =================================================================== --- xen.orig/arch/x86/x86_32/mm.c 2005-05-13 12:37:10.000000000 +0200 +++ xen/arch/x86/x86_32/mm.c 2005-05-13 15:14:18.000000000 +0200 @@ -36,13 +36,21 @@ int map_pages( unsigned long s, unsigned long flags) { +#if CONFIG_PAGING_LEVELS == 3 + l3_pgentry_t *pl3e; +#endif l2_pgentry_t *pl2e; l1_pgentry_t *pl1e; void *newpg; while ( s != 0 ) { +#if CONFIG_PAGING_LEVELS == 3 + pl3e = &pt[l3_table_offset(v)]; + pl2e = l3e_to_l2e(*pl3e) + l2_table_offset(v); +#else pl2e = &pt[l2_table_offset(v)]; +#endif if ( ((s|v|p) & ((1<<L2_PAGETABLE_SHIFT)-1)) == 0 ) { @@ -90,58 +98,83 @@ void __set_fixmap( void __init paging_init(void) { void *ioremap_pt; - unsigned long v; + unsigned long v,v2,i; struct pfn_info *pg; +#ifdef CONFIG_X86_PAE + printk("PAE enabled, limit: %d GB\n", MACHPHYS_MBYTES); +#else + printk("PAE disabled.\n"); +#endif + /* Allocate and map the machine-to-phys table. */ - if ( (pg = alloc_domheap_pages(NULL, 10)) == NULL ) - panic("Not enough memory to bootstrap Xen.\n"); - idle_pg_table[l2_table_offset(RDWR_MPT_VIRT_START)] = - l2e_create_phys(page_to_phys(pg), __PAGE_HYPERVISOR | _PAGE_PSE); - memset((void *)RDWR_MPT_VIRT_START, 0x55, 4UL << 20); + /* Create read-only mapping of MPT for guest-OS use. */ + for (v = RDWR_MPT_VIRT_START, v2 = RO_MPT_VIRT_START; + v != RDWR_MPT_VIRT_END /* && (max_page * 4) >= (v - RDWR_MPT_VIRT_START) */; + v += (1 << L2_PAGETABLE_SHIFT), v2 += (1 << L2_PAGETABLE_SHIFT)) { + if ( (pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER)) == NULL ) + panic("Not enough memory to bootstrap Xen.\n"); + idle_pg_table_l2[l2_linear_offset(v)] = + l2e_create_phys(page_to_phys(pg), + __PAGE_HYPERVISOR | _PAGE_PSE); + idle_pg_table_l2[l2_linear_offset(v2)] = + l2e_create_phys(page_to_phys(pg), + (__PAGE_HYPERVISOR | _PAGE_PSE) & ~_PAGE_RW); + } + memset((void *)RDWR_MPT_VIRT_START, 0x55, v - RDWR_MPT_VIRT_START); - /* Xen 4MB mappings can all be GLOBAL. */ + /* Xen 2/4MB mappings can all be GLOBAL. */ if ( cpu_has_pge ) { - for ( v = HYPERVISOR_VIRT_START; v; v += (1 << L2_PAGETABLE_SHIFT) ) - { - if (l2e_get_flags(idle_pg_table[l2_table_offset(v)]) & _PAGE_PSE) - l2e_add_flags(&idle_pg_table[l2_table_offset(v)], - _PAGE_GLOBAL); + for ( v = HYPERVISOR_VIRT_START; v; v += (1 << L2_PAGETABLE_SHIFT) ) { + if (!l2e_get_flags(idle_pg_table_l2[l2_linear_offset(v)]) & _PAGE_PSE) + continue; + if (v >= RO_MPT_VIRT_START && v < RO_MPT_VIRT_END) + continue; + l2e_add_flags(&idle_pg_table_l2[l2_linear_offset(v)], + _PAGE_GLOBAL); } } - /* Create page table for ioremap(). */ - ioremap_pt = (void *)alloc_xenheap_page(); - clear_page(ioremap_pt); - idle_pg_table[l2_table_offset(IOREMAP_VIRT_START)] = - l2e_create_phys(__pa(ioremap_pt), __PAGE_HYPERVISOR); - - /* Create read-only mapping of MPT for guest-OS use. - * NB. Remove the global bit so that shadow_mode_translate()==true domains - * can reused this address space for their phys-to-machine mapping. 
-     */
-    idle_pg_table[l2_table_offset(RO_MPT_VIRT_START)] =
-        l2e_create_pfn(l2e_get_pfn(idle_pg_table[l2_table_offset(RDWR_MPT_VIRT_START)]),
-                       l2e_get_flags(idle_pg_table[l2_table_offset(RDWR_MPT_VIRT_START)])
-                       & ~(_PAGE_RW | _PAGE_GLOBAL));
+    /* Create page table(s) for ioremap(). */
+    for (v = IOREMAP_VIRT_START; v != IOREMAP_VIRT_END; v += (1 << L2_PAGETABLE_SHIFT)) {
+        ioremap_pt = (void *)alloc_xenheap_page();
+        clear_page(ioremap_pt);
+        idle_pg_table_l2[l2_linear_offset(v)] =
+            l2e_create_phys(__pa(ioremap_pt), __PAGE_HYPERVISOR);
+    }
 
     /* Set up mapping cache for domain pages. */
-    mapcache = (l1_pgentry_t *)alloc_xenheap_page();
-    clear_page(mapcache);
-    idle_pg_table[l2_table_offset(MAPCACHE_VIRT_START)] =
-        l2e_create_phys(__pa(mapcache), __PAGE_HYPERVISOR);
+    mapcache = (l1_pgentry_t*)alloc_xenheap_pages(10-PAGETABLE_ORDER);
+    for (v = MAPCACHE_VIRT_START, i = 0;
+         v != MAPCACHE_VIRT_END;
+         v += (1 << L2_PAGETABLE_SHIFT), i++) {
+        clear_page(mapcache + i*L1_PAGETABLE_ENTRIES);
+        idle_pg_table_l2[l2_linear_offset(v)] =
+            l2e_create_phys(__pa(mapcache + i*L1_PAGETABLE_ENTRIES),
+                            __PAGE_HYPERVISOR);
+    }
 
-    /* Set up linear page table mapping. */
-    idle_pg_table[l2_table_offset(LINEAR_PT_VIRT_START)] =
-        l2e_create_phys(__pa(idle_pg_table), __PAGE_HYPERVISOR);
+    for (v = LINEAR_PT_VIRT_START; v != LINEAR_PT_VIRT_END; v += (1 << L2_PAGETABLE_SHIFT)) {
+        idle_pg_table_l2[l2_linear_offset(v)] =
+            l2e_create_phys(__pa(idle_pg_table_l2) + ((v-RDWR_MPT_VIRT_START) >> PAGETABLE_ORDER),
+                            __PAGE_HYPERVISOR);
+    }
 }
 
-void __init zap_low_mappings(void)
+void __init zap_low_mappings(l2_pgentry_t *base)
 {
     int i;
-    for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
-        idle_pg_table[i] = l2e_empty();
+    u32 addr;
+
+    for (i = 0; ; i++) {
+        addr = (i << L2_PAGETABLE_SHIFT);
+        if (addr >= HYPERVISOR_VIRT_START)
+            break;
+        if (l2e_get_phys(base[i]) != addr)
+            continue;
+        base[i] = l2e_empty();
+    }
     flush_tlb_all_pge();
 }
@@ -163,12 +196,13 @@ void subarch_init_memory(struct domain *
                offsetof(struct pfn_info, count_info),
                offsetof(struct pfn_info, u.inuse._domain),
                sizeof(struct pfn_info));
-        for ( ; ; ) ;
+        for ( ; ; )
+            __asm__ __volatile__ ( "hlt" );
     }
 
     /* M2P table is mappable read-only by privileged domains. */
     m2p_start_mfn = l2e_get_pfn(
-        idle_pg_table[l2_table_offset(RDWR_MPT_VIRT_START)]);
+        idle_pg_table_l2[l2_linear_offset(RDWR_MPT_VIRT_START)]);
     for ( i = 0; i < 1024; i++ )
     {
         frame_table[m2p_start_mfn+i].count_info = PGC_allocated | 1;
@@ -320,7 +354,7 @@ void *memguard_init(void *heap_start)
             l1[j] = l1e_create_phys((i << L2_PAGETABLE_SHIFT) |
                                     (j << L1_PAGETABLE_SHIFT),
                                     __PAGE_HYPERVISOR);
-        idle_pg_table[i + l2_table_offset(PAGE_OFFSET)] =
+        idle_pg_table_l2[i + l2_linear_offset(PAGE_OFFSET)] =
             l2e_create_phys(virt_to_phys(l1), __PAGE_HYPERVISOR);
     }
 
@@ -342,7 +376,7 @@ static void __memguard_change_range(void
 
     while ( _l != 0 )
     {
-        l2  = &idle_pg_table[l2_table_offset(_p)];
+        l2  = &idle_pg_table_l2[l2_linear_offset(_p)];
         l1  = l2e_to_l1e(*l2) + l1_table_offset(_p);
         if ( guard )
             l1e_remove_flags(l1, _PAGE_PRESENT);

Index: xen/include/public/arch-x86_32.h
===================================================================
--- xen.orig/include/public/arch-x86_32.h	2005-05-13 12:37:11.000000000 +0200
+++ xen/include/public/arch-x86_32.h	2005-05-13 12:37:42.000000000 +0200
@@ -72,7 +72,11 @@
  * Virtual addresses beyond this are not modifiable by guest OSes. The
  * machine->physical mapping table starts at this address, read-only.
Index: xen/common/elf.c
===================================================================
--- xen.orig/common/elf.c	2005-05-13 12:37:10.000000000 +0200
+++ xen/common/elf.c	2005-05-13 12:37:42.000000000 +0200
@@ -82,7 +82,20 @@ int parseelfimage(struct domain_setup_in
             printk("ERROR: Xen will only load images built for Xen v3.0\n");
             return -EINVAL;
         }
-
+#if defined(__i386__)
+#ifdef CONFIG_X86_PAE
+        int xen_pae = 1;
+#else
+        int xen_pae = 0;
+#endif
+        int guest_pae = strstr(guestinfo, "PAE=yes") ? 1 : 0;
+        if (xen_pae != guest_pae) {
+            printk("ERROR: PAE mode mismatch (xen=%s,guest=%s)\n",
+                   xen_pae   ? "yes" : "no",
+                   guest_pae ? "yes" : "no");
+            return -EINVAL;
+        }
+#endif
         break;
     }
 
     if ( guestinfo == NULL )
Index: xen/arch/x86/x86_32/domain_page.c
===================================================================
--- xen.orig/arch/x86/x86_32/domain_page.c	2005-05-13 12:37:11.000000000 +0200
+++ xen/arch/x86/x86_32/domain_page.c	2005-05-13 12:37:42.000000000 +0200
@@ -72,7 +72,7 @@ void *map_domain_mem(unsigned long pa)
             shadow_epoch[cpu] = ++epoch;
         }
     }
-    while ( l1e_get_value(cache[idx]) != 0 );
+    while ( l1e_get_flags(cache[idx]) & _PAGE_PRESENT );
 
     cache[idx] = l1e_create_phys(pa, __PAGE_HYPERVISOR);
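To illustrate the new elf.c check: a guest image advertises PAE support in its
info string, and the builder now refuses anything that does not match the
hypervisor build. A standalone sketch of the logic -- the example info string
is made up for illustration, only the "PAE=yes" token is what the code above
actually greps for:

  #include <stdio.h>
  #include <string.h>

  /* Same test as in parseelfimage(): a guest counts as PAE iff its info
   * string contains "PAE=yes"; anything else counts as non-PAE. */
  static int pae_mismatch(const char *guestinfo, int xen_pae)
  {
      int guest_pae = strstr(guestinfo, "PAE=yes") ? 1 : 0;
      return xen_pae != guest_pae;
  }

  int main(void)
  {
      const char *info = "GUEST_OS=linux,XEN_VER=3.0,PAE=yes"; /* made up */

      if (pae_mismatch(info, /* xen_pae = */ 0))
          printf("ERROR: PAE mode mismatch (xen=no,guest=yes)\n");
      return 0;
  }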
Index: xen/include/asm-x86/mm.h
===================================================================
--- xen.orig/include/asm-x86/mm.h	2005-05-13 12:37:10.000000000 +0200
+++ xen/include/asm-x86/mm.h	2005-05-13 12:37:42.000000000 +0200
@@ -76,15 +76,15 @@ struct pfn_info
  /* Owning guest has pinned this page to its current type? */
 #define _PGT_pinned         27
 #define PGT_pinned          (1U<<_PGT_pinned)
- /* The 10 most significant bits of virt address if this is a page table. */
-#define PGT_va_shift        17
-#define PGT_va_mask         (((1U<<10)-1)<<PGT_va_shift)
+ /* The 11 most significant bits of virt address if this is a page table. */
+#define PGT_va_shift        16
+#define PGT_va_mask         (((1U<<11)-1)<<PGT_va_shift)
  /* Is the back pointer still mutable (i.e. not fixed yet)? */
-#define PGT_va_mutable      (((1U<<10)-1)<<PGT_va_shift)
+#define PGT_va_mutable      (((1U<<11)-1)<<PGT_va_shift)
  /* Is the back pointer unknown (e.g., p.t. is mapped at multiple VAs)? */
-#define PGT_va_unknown      (((1U<<10)-2)<<PGT_va_shift)
- /* 17-bit count of uses of this frame as its current type. */
-#define PGT_count_mask      ((1U<<17)-1)
+#define PGT_va_unknown      (((1U<<11)-2)<<PGT_va_shift)
+ /* 16-bit count of uses of this frame as its current type. */
+#define PGT_count_mask      ((1U<<16)-1)
 
 #define PGT_mfn_mask        ((1U<<20)-1) /* mfn mask for shadow types */
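On the type_info repacking above: a PAE l2 table has 512 slots and a guest can
have up to four of them, so an l1 table's position needs 11 bits (4*512 = 2048
slots) instead of the 10 bits that were enough for a single 1024-entry l2 --
presumably that is why the va field grows by one bit and the type-use count
shrinks from 17 to 16 bits to pay for it. A standalone sketch of the packing,
using the new values from this hunk (the sample numbers are illustration only):

  #include <stdio.h>

  #define PGT_va_shift   16
  #define PGT_va_mask    (((1U << 11) - 1) << PGT_va_shift)
  #define PGT_count_mask ((1U << 16) - 1)

  int main(void)
  {
      /* l1 table living at l2 slot 1964, used 42 times as a page table */
      unsigned int type_info = (1964U << PGT_va_shift) | 42;

      printf("va slot: %u  type count: %u\n",
             (type_info & PGT_va_mask) >> PGT_va_shift,
             type_info & PGT_count_mask);
      return 0;
  }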
Index: xen/arch/x86/shadow.c
===================================================================
--- xen.orig/arch/x86/shadow.c	2005-05-13 12:37:11.000000000 +0200
+++ xen/arch/x86/shadow.c	2005-05-13 12:55:14.000000000 +0200
@@ -358,13 +358,13 @@ free_shadow_hl2_table(struct domain *d,
 }
 
 static void inline
-free_shadow_l2_table(struct domain *d, unsigned long smfn)
+free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type)
 {
     l2_pgentry_t *pl2e = map_domain_mem(smfn << PAGE_SHIFT);
     int i, external = shadow_mode_external(d);
 
     for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-        if ( external || is_guest_l2_slot(i) )
+        if ( external || is_guest_l2_slot(type, i) )
             if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT )
                 put_shadow_ref(l2e_get_pfn(pl2e[i]));
 
@@ -404,7 +404,7 @@ void free_shadow_page(unsigned long smfn
     case PGT_l2_shadow:
         perfc_decr(shadow_l2_pages);
         shadow_demote(d, gpfn, gmfn);
-        free_shadow_l2_table(d, smfn);
+        free_shadow_l2_table(d, smfn, page->u.inuse.type_info);
         break;
 
     case PGT_hl2_shadow:
@@ -1093,7 +1093,8 @@ translate_l1pgtable(struct domain *d, l1
 // up dom0.
 //
 void
-translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn)
+translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn,
+                    unsigned int type)
 {
     int i;
     l2_pgentry_t *l2;
@@ -1103,7 +1104,7 @@ translate_l2pgtable(struct domain *d, l1
     l2 = map_domain_mem(l2mfn << PAGE_SHIFT);
     for (i = 0; i < L2_PAGETABLE_ENTRIES; i++)
     {
-        if ( is_guest_l2_slot(i) &&
+        if ( is_guest_l2_slot(type, i) &&
              (l2e_get_flags(l2[i]) & _PAGE_PRESENT) )
         {
             unsigned long mfn = l2e_get_pfn(l2[i]);
@@ -1409,8 +1410,8 @@ gpfn_to_mfn_foreign(struct domain *d, un
     unmap_domain_mem(l2);
     if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
     {
-        printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => 0 l2e=%lx\n",
-               d->domain_id, gpfn, l2e_get_value(l2e));
+        printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => 0 l2e=%llx\n",
+               d->domain_id, gpfn, (u64)l2e_get_value(l2e));
         return INVALID_MFN;
     }
     unsigned long l1tab = l2e_get_phys(l2e);
@@ -1425,8 +1426,8 @@ gpfn_to_mfn_foreign(struct domain *d, un
 
     if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
     {
-        printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => 0 l1e=%lx\n",
-               d->domain_id, gpfn, l1e_get_value(l1e));
+        printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => 0 l1e=%llx\n",
+               d->domain_id, gpfn, (u64)l1e_get_value(l1e));
         return INVALID_MFN;
     }
 
@@ -2389,7 +2390,10 @@ static int resync_all(struct domain *d,
             changed = 0;
             for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
             {
-                if ( !is_guest_l2_slot(i) && !external )
+#ifdef CONFIG_X86_PAE
+                BUG();  /* FIXME: need type_info */
+#endif
+                if ( !is_guest_l2_slot(0, i) && !external )
                     continue;
 
                 l2_pgentry_t new_pde = guest2[i];
@@ -2432,7 +2436,10 @@ static int resync_all(struct domain *d,
             changed = 0;
             for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
             {
-                if ( !is_guest_l2_slot(i) && !external )
+#ifdef CONFIG_X86_PAE
+                BUG();  /* FIXME: need type_info */
+#endif
+                if ( !is_guest_l2_slot(0, i) && !external )
                     continue;
 
                 l2_pgentry_t new_pde = guest2[i];
@@ -2645,8 +2652,8 @@ int shadow_fault(unsigned long va, struc
                               &gpte, sizeof(gpte))) )
         {
             printk("%s() failed, crashing domain %d "
-                   "due to a read-only L2 page table (gpde=%lx), va=%lx\n",
-                   __func__, d->domain_id, l2e_get_value(gpde), va);
+                   "due to a read-only L2 page table (gpde=%llx), va=%lx\n",
+                   __func__, d->domain_id, (u64)l2e_get_value(gpde), va);
             domain_crash_synchronous();
         }
 
@@ -2719,7 +2726,7 @@ void shadow_l2_normal_pt_update(
     shadow_unlock(d);
 }
 
-#ifdef __x86_64__
+#if CONFIG_PAGING_LEVELS >= 3
 void shadow_l3_normal_pt_update(
     struct domain *d, unsigned long pa, l3_pgentry_t gpde,
@@ -2727,7 +2734,9 @@
 {
     BUG(); // not yet implemented
 }
+#endif
 
+#if CONFIG_PAGING_LEVELS >= 4
 void shadow_l4_normal_pt_update(
     struct domain *d, unsigned long pa, l4_pgentry_t gpde,
Index: xen/include/asm-x86/types.h
===================================================================
--- xen.orig/include/asm-x86/types.h	2005-05-13 12:37:10.000000000 +0200
+++ xen/include/asm-x86/types.h	2005-05-13 12:37:42.000000000 +0200
@@ -44,11 +44,17 @@ typedef signed long long s64;
 typedef unsigned long long u64;
 #define BITS_PER_LONG 32
 typedef unsigned int size_t;
+#if defined(CONFIG_X86_PAE)
+typedef u64 physaddr_t;
+#else
+typedef u32 physaddr_t;
+#endif
 #elif defined(__x86_64__)
 typedef signed long s64;
 typedef unsigned long u64;
 #define BITS_PER_LONG 64
 typedef unsigned long size_t;
+typedef u64 physaddr_t;
 #endif
 
 /* DMA addresses come in generic and 64-bit flavours. */
Index: xen/include/asm-x86/shadow.h
===================================================================
--- xen.orig/include/asm-x86/shadow.h	2005-05-13 12:37:10.000000000 +0200
+++ xen/include/asm-x86/shadow.h	2005-05-13 12:51:23.000000000 +0200
@@ -126,10 +126,12 @@ extern void shadow_l1_normal_pt_update(s
 extern void shadow_l2_normal_pt_update(struct domain *d, unsigned long pa,
                                        l2_pgentry_t l2e,
                                        struct map_dom_mem_cache *cache);
-#ifdef __x86_64__
+#if CONFIG_PAGING_LEVELS >= 3
 extern void shadow_l3_normal_pt_update(struct domain *d, unsigned long pa,
                                        l3_pgentry_t l3e,
                                        struct map_dom_mem_cache *cache);
+#endif
+#if CONFIG_PAGING_LEVELS >= 4
 extern void shadow_l4_normal_pt_update(struct domain *d, unsigned long pa,
                                        l4_pgentry_t l4e,
                                        struct map_dom_mem_cache *cache);
Index: xen/include/asm-x86/smp.h
===================================================================
--- xen.orig/include/asm-x86/smp.h	2005-05-13 12:37:10.000000000 +0200
+++ xen/include/asm-x86/smp.h	2005-05-13 12:47:49.000000000 +0200
@@ -38,7 +38,7 @@ extern cpumask_t cpu_sibling_map[];
 extern void smp_flush_tlb(void);
 extern void smp_invalidate_rcv(void);     /* Process an NMI */
 extern void (*mtrr_hook) (void);
-extern void zap_low_mappings (void);
+extern void zap_low_mappings(l2_pgentry_t *base);
 #define MAX_APICID 256
 extern u8 x86_cpu_to_apicid[];

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel