[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH 12/22] mini-os: add x86 native page table handling
Juergen Gross, on Tue 23 Aug 2016 17:15:58 +0200, wrote: > For support of HVMlite don't use mmu_update hypercalls, but write the > page table entries directly. > > Signed-off-by: Juergen Gross <jgross@xxxxxxxx> Reviewed-by: Samuel Thibault <samuel.thibault@xxxxxxxxxxxx> > --- > arch/x86/mm.c | 147 > +++++++++++++++++++++++++++++++++++++------------- > arch/x86/traps.c | 10 ++++ > include/x86/arch_mm.h | 4 ++ > include/x86/os.h | 9 ++++ > 4 files changed, 132 insertions(+), 38 deletions(-) > > diff --git a/arch/x86/mm.c b/arch/x86/mm.c > index cbb5617..f5248a4 100644 > --- a/arch/x86/mm.c > +++ b/arch/x86/mm.c > @@ -123,16 +123,25 @@ void arch_mm_preinit(void *p) > * table at offset in previous level MFN (pref_l_mfn). pt_pfn is a guest > * PFN. > */ > +static pgentry_t pt_prot[PAGETABLE_LEVELS] = { > + L1_PROT, > + L2_PROT, > + L3_PROT, > +#if defined(__x86_64__) > + L4_PROT, > +#endif > +}; > + > static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn, > unsigned long offset, unsigned long level) > { > - pgentry_t *tab = pt_base; > + pgentry_t *tab; > unsigned long pt_page = (unsigned long)pfn_to_virt(*pt_pfn); > - pgentry_t prot_e, prot_t; > +#ifdef CONFIG_PARAVIRT > mmu_update_t mmu_updates[1]; > int rc; > +#endif > > - prot_e = prot_t = 0; > DEBUG("Allocating new L%d pt frame for pfn=%lx, " > "prev_l_mfn=%lx, offset=%lx", > level, *pt_pfn, prev_l_mfn, offset); > @@ -140,30 +149,12 @@ static void new_pt_frame(unsigned long *pt_pfn, > unsigned long prev_l_mfn, > /* We need to clear the page, otherwise we might fail to map it > as a page table page */ > memset((void*) pt_page, 0, PAGE_SIZE); > - > - switch ( level ) > - { > - case L1_FRAME: > - prot_e = L1_PROT; > - prot_t = L2_PROT; > - break; > - case L2_FRAME: > - prot_e = L2_PROT; > - prot_t = L3_PROT; > - break; > -#if defined(__x86_64__) > - case L3_FRAME: > - prot_e = L3_PROT; > - prot_t = L4_PROT; > - break; > -#endif > - default: > - printk("new_pt_frame() called with invalid level number 
%lu\n", > level); > - do_exit(); > - break; > - } > > + ASSERT(level >= 1 && level <= PAGETABLE_LEVELS); > + > +#ifdef CONFIG_PARAVIRT > /* Make PFN a page table page */ > + tab = pt_base; > #if defined(__x86_64__) > tab = pte_to_virt(tab[l4_table_offset(pt_page)]); > #endif > @@ -172,7 +163,7 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned > long prev_l_mfn, > mmu_updates[0].ptr = (tab[l2_table_offset(pt_page)] & PAGE_MASK) + > sizeof(pgentry_t) * l1_table_offset(pt_page); > mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT | > - (prot_e & ~_PAGE_RW); > + (pt_prot[level - 1] & ~_PAGE_RW); > > if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 ) > { > @@ -184,13 +175,18 @@ static void new_pt_frame(unsigned long *pt_pfn, > unsigned long prev_l_mfn, > /* Hook the new page table page into the hierarchy */ > mmu_updates[0].ptr = > ((pgentry_t)prev_l_mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset; > - mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT | > prot_t; > + mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT | > + pt_prot[level]; > > if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 > ) > { > printk("ERROR: mmu_update failed with rc=%d\n", rc); > do_exit(); > } > +#else > + tab = mfn_to_virt(prev_l_mfn); > + tab[offset] = (*pt_pfn << PAGE_SHIFT) | pt_prot[level]; > +#endif > > *pt_pfn += 1; > } > @@ -202,12 +198,14 @@ static void build_pagetable(unsigned long *start_pfn, > unsigned long *max_pfn) > { > unsigned long start_address, end_address; > unsigned long pfn_to_map, pt_pfn = *start_pfn; > - static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1]; > pgentry_t *tab = pt_base, page; > unsigned long pt_mfn = pfn_to_mfn(virt_to_pfn(pt_base)); > unsigned long offset; > +#ifdef CONFIG_PARAVIRT > + static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1]; > int count = 0; > int rc; > +#endif > > /* Be conservative: even if we know there will be more pages 
already > mapped, start the loop at the very beginning. */ > @@ -225,6 +223,10 @@ static void build_pagetable(unsigned long *start_pfn, > unsigned long *max_pfn) > ((unsigned long)pfn_to_virt(*max_pfn) - > (unsigned long)&_text)>>20); > } > +#else > + /* Round up to next 2MB boundary as we are using 2MB pages on HVMlite. */ > + pfn_to_map = (pfn_to_map + L1_PAGETABLE_ENTRIES - 1) & > + ~(L1_PAGETABLE_ENTRIES - 1); > #endif > > start_address = (unsigned long)pfn_to_virt(pfn_to_map); > @@ -257,6 +259,7 @@ static void build_pagetable(unsigned long *start_pfn, > unsigned long *max_pfn) > pt_mfn = pte_to_mfn(page); > tab = to_virt(mfn_to_pfn(pt_mfn) << PAGE_SHIFT); > offset = l2_table_offset(start_address); > +#ifdef CONFIG_PARAVIRT > /* Need new L1 pt frame */ > if ( !(tab[offset] & _PAGE_PRESENT) ) > new_pt_frame(&pt_pfn, pt_mfn, offset, L1_FRAME); > @@ -288,6 +291,12 @@ static void build_pagetable(unsigned long *start_pfn, > unsigned long *max_pfn) > count = 0; > } > start_address += PAGE_SIZE; > +#else > + if ( !(tab[offset] & _PAGE_PRESENT) ) > + tab[offset] = (pgentry_t)pfn_to_map << PAGE_SHIFT | > + L2_PROT | _PAGE_PSE; > + start_address += 1UL << L2_PAGETABLE_SHIFT; > +#endif > } > > *start_pfn = pt_pfn; > @@ -302,16 +311,19 @@ static void set_readonly(void *text, void *etext) > unsigned long start_address = > ((unsigned long) text + PAGE_SIZE - 1) & PAGE_MASK; > unsigned long end_address = (unsigned long) etext; > - static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1]; > pgentry_t *tab = pt_base, page; > unsigned long mfn = pfn_to_mfn(virt_to_pfn(pt_base)); > unsigned long offset; > + unsigned long page_size = PAGE_SIZE; > +#ifdef CONFIG_PARAVIRT > + static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1]; > int count = 0; > int rc; > +#endif > > printk("setting %p-%p readonly\n", text, etext); > > - while ( start_address + PAGE_SIZE <= end_address ) > + while ( start_address + page_size <= end_address ) > { > tab = pt_base; > mfn = 
pfn_to_mfn(virt_to_pfn(pt_base)); > @@ -327,26 +339,34 @@ static void set_readonly(void *text, void *etext) > mfn = pte_to_mfn(page); > tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT); > offset = l2_table_offset(start_address); > - page = tab[offset]; > - mfn = pte_to_mfn(page); > - tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT); > + if ( !(tab[offset] & _PAGE_PSE) ) > + { > + page = tab[offset]; > + mfn = pte_to_mfn(page); > + tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT); > > - offset = l1_table_offset(start_address); > + offset = l1_table_offset(start_address); > + } > > if ( start_address != (unsigned long)&shared_info ) > { > +#ifdef CONFIG_PARAVIRT > mmu_updates[count].ptr = > ((pgentry_t)mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset; > mmu_updates[count].val = tab[offset] & ~_PAGE_RW; > count++; > +#else > + tab[offset] &= ~_PAGE_RW; > +#endif > } > else > printk("skipped %lx\n", start_address); > > - start_address += PAGE_SIZE; > + start_address += page_size; > > +#ifdef CONFIG_PARAVIRT > if ( count == L1_PAGETABLE_ENTRIES || > - start_address + PAGE_SIZE > end_address ) > + start_address + page_size > end_address ) > { > rc = HYPERVISOR_mmu_update(mmu_updates, count, NULL, DOMID_SELF); > if ( rc < 0 ) > @@ -356,8 +376,13 @@ static void set_readonly(void *text, void *etext) > } > count = 0; > } > +#else > + if ( start_address == (1UL << L2_PAGETABLE_SHIFT) ) > + page_size = 1UL << L2_PAGETABLE_SHIFT; > +#endif > } > > +#ifdef CONFIG_PARAVIRT > { > mmuext_op_t op = { > .cmd = MMUEXT_TLB_FLUSH_ALL, > @@ -365,6 +390,9 @@ static void set_readonly(void *text, void *etext) > int count; > HYPERVISOR_mmuext_op(&op, 1, &count, DOMID_SELF); > } > +#else > + write_cr3((unsigned long)pt_base); > +#endif > } > > /* > @@ -394,6 +422,8 @@ static pgentry_t *get_pgt(unsigned long va) > offset = l2_table_offset(va); > if ( !(tab[offset] & _PAGE_PRESENT) ) > return NULL; > + if ( tab[offset] & _PAGE_PSE ) > + return &tab[offset]; > mfn = pte_to_mfn(tab[offset]); > tab = 
mfn_to_virt(mfn); > offset = l1_table_offset(va); > @@ -448,6 +478,9 @@ pgentry_t *need_pgt(unsigned long va) > new_pt_frame(&pt_pfn, pt_mfn, offset, L1_FRAME); > } > ASSERT(tab[offset] & _PAGE_PRESENT); > + if ( tab[offset] & _PAGE_PSE ) > + return &tab[offset]; > + > pt_mfn = pte_to_mfn(tab[offset]); > tab = mfn_to_virt(pt_mfn); > > @@ -524,8 +557,6 @@ int do_map_frames(unsigned long va, > { > pgentry_t *pgt = NULL; > unsigned long done = 0; > - unsigned long i; > - int rc; > > if ( !mfns ) > { > @@ -539,6 +570,9 @@ int do_map_frames(unsigned long va, > memset(err, 0x00, n * sizeof(int)); > while ( done < n ) > { > +#ifdef CONFIG_PARAVIRT > + unsigned long i; > + int rc; > unsigned long todo; > > if ( err ) > @@ -578,6 +612,17 @@ int do_map_frames(unsigned long va, > } > } > done += todo; > +#else > + if ( !pgt || !(va & L1_MASK) ) > + pgt = need_pgt(va & ~L1_MASK); > + if ( !pgt ) > + return -ENOMEM; > + > + ASSERT(!(*pgt & _PAGE_PSE)); > + pgt[l1_table_offset(va)] = (pgentry_t) > + (((mfns[done * stride] + done * incr) << PAGE_SHIFT) | prot); > + done++; > +#endif > } > > return 0; > @@ -609,16 +654,21 @@ void *map_frames_ex(const unsigned long *mfns, unsigned > long n, > #define UNMAP_BATCH ((STACK_SIZE / 2) / sizeof(multicall_entry_t)) > int unmap_frames(unsigned long va, unsigned long num_frames) > { > +#ifdef CONFIG_PARAVIRT > int n = UNMAP_BATCH; > multicall_entry_t call[n]; > int ret; > int i; > +#else > + pgentry_t *pgt; > +#endif > > ASSERT(!((unsigned long)va & ~PAGE_MASK)); > > DEBUG("va=%p, num=0x%lx\n", va, num_frames); > > while ( num_frames ) { > +#ifdef CONFIG_PARAVIRT > if ( n > num_frames ) > n = num_frames; > > @@ -653,6 +703,17 @@ int unmap_frames(unsigned long va, unsigned long > num_frames) > } > } > num_frames -= n; > +#else > + pgt = get_pgt(va); > + if ( pgt ) > + { > + ASSERT(!(*pgt & _PAGE_PSE)); > + *pgt = 0; > + invlpg(va); > + } > + va += PAGE_SIZE; > + num_frames--; > +#endif > } > return 0; > } > @@ -662,14 +723,24 @@ int 
unmap_frames(unsigned long va, unsigned long > num_frames) > */ > static void clear_bootstrap(void) > { > +#ifdef CONFIG_PARAVIRT > pte_t nullpte = { }; > int rc; > +#else > + pgentry_t *pgt; > +#endif > > /* Use first page as the CoW zero page */ > memset(&_text, 0, PAGE_SIZE); > mfn_zero = virt_to_mfn((unsigned long) &_text); > +#ifdef CONFIG_PARAVIRT > if ( (rc = HYPERVISOR_update_va_mapping(0, nullpte, UVMF_INVLPG)) ) > printk("Unable to unmap NULL page. rc=%d\n", rc); > +#else > + pgt = get_pgt((unsigned long)&_text); > + *pgt = 0; > + invlpg((unsigned long)&_text); > +#endif > } > > #ifdef CONFIG_PARAVIRT > diff --git a/arch/x86/traps.c b/arch/x86/traps.c > index 2d3222d..aa17da3 100644 > --- a/arch/x86/traps.c > +++ b/arch/x86/traps.c > @@ -121,7 +121,9 @@ void page_walk(unsigned long virt_address) > static int handle_cow(unsigned long addr) { > pgentry_t *tab = pt_base, page; > unsigned long new_page; > +#ifdef CONFIG_PARAVIRT > int rc; > +#endif > > #if defined(__x86_64__) > page = tab[l4_table_offset(addr)]; > @@ -137,6 +139,8 @@ static int handle_cow(unsigned long addr) { > page = tab[l2_table_offset(addr)]; > if (!(page & _PAGE_PRESENT)) > return 0; > + if ( page & _PAGE_PSE ) > + return 0; > tab = pte_to_virt(page); > > page = tab[l1_table_offset(addr)]; > @@ -149,12 +153,18 @@ static int handle_cow(unsigned long addr) { > new_page = alloc_pages(0); > memset((void*) new_page, 0, PAGE_SIZE); > > +#ifdef CONFIG_PARAVIRT > rc = HYPERVISOR_update_va_mapping(addr & PAGE_MASK, > __pte(virt_to_mach(new_page) | L1_PROT), UVMF_INVLPG); > if (!rc) > return 1; > > printk("Map zero page to %lx failed: %d.\n", addr, rc); > return 0; > +#else > + tab[l1_table_offset(addr)] = virt_to_mach(new_page) | L1_PROT; > + invlpg(addr); > + return 1; > +#endif > } > > static void do_stack_walk(unsigned long frame_base) > diff --git a/include/x86/arch_mm.h b/include/x86/arch_mm.h > index 28ab406..e0ae552 100644 > --- a/include/x86/arch_mm.h > +++ b/include/x86/arch_mm.h > @@ 
-78,6 +78,8 @@ > #define L2_PAGETABLE_ENTRIES 512 > #define L3_PAGETABLE_ENTRIES 4 > > +#define PAGETABLE_LEVELS 3 > + > #define PADDR_BITS 44 > #define PADDR_MASK ((1ULL << PADDR_BITS)-1) > > @@ -110,6 +112,8 @@ typedef uint64_t pgentry_t; > #define L3_PAGETABLE_ENTRIES 512 > #define L4_PAGETABLE_ENTRIES 512 > > +#define PAGETABLE_LEVELS 4 > + > /* These are page-table limitations. Current CPUs support only 40-bit phys. > */ > #define PADDR_BITS 52 > #define VADDR_BITS 48 > diff --git a/include/x86/os.h b/include/x86/os.h > index 1083328..20cc27f 100644 > --- a/include/x86/os.h > +++ b/include/x86/os.h > @@ -206,6 +206,15 @@ static inline int irqs_disabled(void) > */ > typedef struct { volatile int counter; } atomic_t; > > +static inline void write_cr3(unsigned long cr3) > +{ > + asm volatile( "mov %0, %%cr3" : : "r" (cr3) : "memory" ); > +} > + > +static inline void invlpg(unsigned long va) > +{ > + asm volatile ( "invlpg %0": : "m" (*(const char *)(va)) : "memory" ); > +} > > /************************** i386 *******************************/ > #ifdef __INSIDE_MINIOS__ > -- > 2.6.6 > -- Samuel /* Amuse the user. */ printk( " \\|/ ____ \\|/\n" " \"@'/ ,. \\`@\"\n" " /_| \\__/ |_\\\n" " \\__U_/\n"); (From linux/arch/sparc/kernel/traps.c:die_if_kernel()) _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx https://lists.xen.org/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |