[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH 12/22] mini-os: add x86 native page table handling



Juergen Gross, on Tue 23 Aug 2016 17:15:58 +0200, wrote:
> For support of HVMlite don't use mmu_update hypercalls, but write the
> page table entries directly.
> 
> Signed-off-by: Juergen Gross <jgross@xxxxxxxx>

Reviewed-by: Samuel Thibault <samuel.thibault@xxxxxxxxxxxx>

> ---
>  arch/x86/mm.c         | 147 +++++++++++++++++++++++++++++++++++++-------------
>  arch/x86/traps.c      |  10 ++++
>  include/x86/arch_mm.h |   4 ++
>  include/x86/os.h      |   9 ++++
>  4 files changed, 132 insertions(+), 38 deletions(-)
> 
> diff --git a/arch/x86/mm.c b/arch/x86/mm.c
> index cbb5617..f5248a4 100644
> --- a/arch/x86/mm.c
> +++ b/arch/x86/mm.c
> @@ -123,16 +123,25 @@ void arch_mm_preinit(void *p)
>   * table at offset in previous level MFN (prev_l_mfn). pt_pfn is a guest
>   * PFN.
>   */
> +static pgentry_t pt_prot[PAGETABLE_LEVELS] = {
> +    L1_PROT,
> +    L2_PROT,
> +    L3_PROT,
> +#if defined(__x86_64__)
> +    L4_PROT,
> +#endif
> +};
> +
>  static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn, 
>                           unsigned long offset, unsigned long level)
>  {   
> -    pgentry_t *tab = pt_base;
> +    pgentry_t *tab;
>      unsigned long pt_page = (unsigned long)pfn_to_virt(*pt_pfn); 
> -    pgentry_t prot_e, prot_t;
> +#ifdef CONFIG_PARAVIRT
>      mmu_update_t mmu_updates[1];
>      int rc;
> +#endif
>      
> -    prot_e = prot_t = 0;
>      DEBUG("Allocating new L%d pt frame for pfn=%lx, "
>            "prev_l_mfn=%lx, offset=%lx", 
>            level, *pt_pfn, prev_l_mfn, offset);
> @@ -140,30 +149,12 @@ static void new_pt_frame(unsigned long *pt_pfn, 
> unsigned long prev_l_mfn,
>      /* We need to clear the page, otherwise we might fail to map it
>         as a page table page */
>      memset((void*) pt_page, 0, PAGE_SIZE);  
> - 
> -    switch ( level )
> -    {
> -    case L1_FRAME:
> -        prot_e = L1_PROT;
> -        prot_t = L2_PROT;
> -        break;
> -    case L2_FRAME:
> -        prot_e = L2_PROT;
> -        prot_t = L3_PROT;
> -        break;
> -#if defined(__x86_64__)
> -    case L3_FRAME:
> -        prot_e = L3_PROT;
> -        prot_t = L4_PROT;
> -        break;
> -#endif
> -    default:
> -        printk("new_pt_frame() called with invalid level number %lu\n", level);
> -        do_exit();
> -        break;
> -    }
>  
> +    ASSERT(level >= 1 && level <= PAGETABLE_LEVELS);
> +
> +#ifdef CONFIG_PARAVIRT
>      /* Make PFN a page table page */
> +    tab = pt_base;
>  #if defined(__x86_64__)
>      tab = pte_to_virt(tab[l4_table_offset(pt_page)]);
>  #endif
> @@ -172,7 +163,7 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned 
> long prev_l_mfn,
>      mmu_updates[0].ptr = (tab[l2_table_offset(pt_page)] & PAGE_MASK) + 
>          sizeof(pgentry_t) * l1_table_offset(pt_page);
>      mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT | 
> -        (prot_e & ~_PAGE_RW);
> +        (pt_prot[level - 1] & ~_PAGE_RW);
>      
>      if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 )
>      {
> @@ -184,13 +175,18 @@ static void new_pt_frame(unsigned long *pt_pfn, 
> unsigned long prev_l_mfn,
>      /* Hook the new page table page into the hierarchy */
>      mmu_updates[0].ptr =
>          ((pgentry_t)prev_l_mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset;
> -    mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT | prot_t;
> +    mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT |
> +        pt_prot[level];
>  
>      if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 )
>      {
>          printk("ERROR: mmu_update failed with rc=%d\n", rc);
>          do_exit();
>      }
> +#else
> +    tab = mfn_to_virt(prev_l_mfn);
> +    tab[offset] = (*pt_pfn << PAGE_SHIFT) | pt_prot[level];
> +#endif
>  
>      *pt_pfn += 1;
>  }
> @@ -202,12 +198,14 @@ static void build_pagetable(unsigned long *start_pfn, 
> unsigned long *max_pfn)
>  {
>      unsigned long start_address, end_address;
>      unsigned long pfn_to_map, pt_pfn = *start_pfn;
> -    static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1];
>      pgentry_t *tab = pt_base, page;
>      unsigned long pt_mfn = pfn_to_mfn(virt_to_pfn(pt_base));
>      unsigned long offset;
> +#ifdef CONFIG_PARAVIRT
> +    static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1];
>      int count = 0;
>      int rc;
> +#endif
>  
>      /* Be conservative: even if we know there will be more pages already
>         mapped, start the loop at the very beginning. */
> @@ -225,6 +223,10 @@ static void build_pagetable(unsigned long *start_pfn, 
> unsigned long *max_pfn)
>                 ((unsigned long)pfn_to_virt(*max_pfn) - 
>                  (unsigned long)&_text)>>20);
>      }
> +#else
> +    /* Round up to next 2MB boundary as we are using 2MB pages on HVMlite. */
> +    pfn_to_map = (pfn_to_map + L1_PAGETABLE_ENTRIES - 1) &
> +                 ~(L1_PAGETABLE_ENTRIES - 1);
>  #endif
>  
>      start_address = (unsigned long)pfn_to_virt(pfn_to_map);
> @@ -257,6 +259,7 @@ static void build_pagetable(unsigned long *start_pfn, 
> unsigned long *max_pfn)
>          pt_mfn = pte_to_mfn(page);
>          tab = to_virt(mfn_to_pfn(pt_mfn) << PAGE_SHIFT);
>          offset = l2_table_offset(start_address);        
> +#ifdef CONFIG_PARAVIRT
>          /* Need new L1 pt frame */
>          if ( !(tab[offset] & _PAGE_PRESENT) )
>              new_pt_frame(&pt_pfn, pt_mfn, offset, L1_FRAME);
> @@ -288,6 +291,12 @@ static void build_pagetable(unsigned long *start_pfn, 
> unsigned long *max_pfn)
>              count = 0;
>          }
>          start_address += PAGE_SIZE;
> +#else
> +        if ( !(tab[offset] & _PAGE_PRESENT) )
> +            tab[offset] = (pgentry_t)pfn_to_map << PAGE_SHIFT |
> +                          L2_PROT | _PAGE_PSE;
> +        start_address += 1UL << L2_PAGETABLE_SHIFT;
> +#endif
>      }
>  
>      *start_pfn = pt_pfn;
> @@ -302,16 +311,19 @@ static void set_readonly(void *text, void *etext)
>      unsigned long start_address =
>          ((unsigned long) text + PAGE_SIZE - 1) & PAGE_MASK;
>      unsigned long end_address = (unsigned long) etext;
> -    static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1];
>      pgentry_t *tab = pt_base, page;
>      unsigned long mfn = pfn_to_mfn(virt_to_pfn(pt_base));
>      unsigned long offset;
> +    unsigned long page_size = PAGE_SIZE;
> +#ifdef CONFIG_PARAVIRT
> +    static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1];
>      int count = 0;
>      int rc;
> +#endif
>  
>      printk("setting %p-%p readonly\n", text, etext);
>  
> -    while ( start_address + PAGE_SIZE <= end_address )
> +    while ( start_address + page_size <= end_address )
>      {
>          tab = pt_base;
>          mfn = pfn_to_mfn(virt_to_pfn(pt_base));
> @@ -327,26 +339,34 @@ static void set_readonly(void *text, void *etext)
>          mfn = pte_to_mfn(page);
>          tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT);
>          offset = l2_table_offset(start_address);        
> -        page = tab[offset];
> -        mfn = pte_to_mfn(page);
> -        tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT);
> +        if ( !(tab[offset] & _PAGE_PSE) )
> +        {
> +            page = tab[offset];
> +            mfn = pte_to_mfn(page);
> +            tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT);
>  
> -        offset = l1_table_offset(start_address);
> +            offset = l1_table_offset(start_address);
> +        }
>  
>          if ( start_address != (unsigned long)&shared_info )
>          {
> +#ifdef CONFIG_PARAVIRT
>              mmu_updates[count].ptr = 
>                  ((pgentry_t)mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset;
>              mmu_updates[count].val = tab[offset] & ~_PAGE_RW;
>              count++;
> +#else
> +            tab[offset] &= ~_PAGE_RW;
> +#endif
>          }
>          else
>              printk("skipped %lx\n", start_address);
>  
> -        start_address += PAGE_SIZE;
> +        start_address += page_size;
>  
> +#ifdef CONFIG_PARAVIRT
>          if ( count == L1_PAGETABLE_ENTRIES || 
> -             start_address + PAGE_SIZE > end_address )
> +             start_address + page_size > end_address )
>          {
>              rc = HYPERVISOR_mmu_update(mmu_updates, count, NULL, DOMID_SELF);
>              if ( rc < 0 )
> @@ -356,8 +376,13 @@ static void set_readonly(void *text, void *etext)
>              }
>              count = 0;
>          }
> +#else
> +        if ( start_address == (1UL << L2_PAGETABLE_SHIFT) )
> +            page_size = 1UL << L2_PAGETABLE_SHIFT;
> +#endif
>      }
>  
> +#ifdef CONFIG_PARAVIRT
>      {
>          mmuext_op_t op = {
>              .cmd = MMUEXT_TLB_FLUSH_ALL,
> @@ -365,6 +390,9 @@ static void set_readonly(void *text, void *etext)
>          int count;
>          HYPERVISOR_mmuext_op(&op, 1, &count, DOMID_SELF);
>      }
> +#else
> +    write_cr3((unsigned long)pt_base);
> +#endif
>  }
>  
>  /*
> @@ -394,6 +422,8 @@ static pgentry_t *get_pgt(unsigned long va)
>      offset = l2_table_offset(va);
>      if ( !(tab[offset] & _PAGE_PRESENT) )
>          return NULL;
> +    if ( tab[offset] & _PAGE_PSE )
> +        return &tab[offset];
>      mfn = pte_to_mfn(tab[offset]);
>      tab = mfn_to_virt(mfn);
>      offset = l1_table_offset(va);
> @@ -448,6 +478,9 @@ pgentry_t *need_pgt(unsigned long va)
>          new_pt_frame(&pt_pfn, pt_mfn, offset, L1_FRAME);
>      }
>      ASSERT(tab[offset] & _PAGE_PRESENT);
> +    if ( tab[offset] & _PAGE_PSE )
> +        return &tab[offset];
> +
>      pt_mfn = pte_to_mfn(tab[offset]);
>      tab = mfn_to_virt(pt_mfn);
>  
> @@ -524,8 +557,6 @@ int do_map_frames(unsigned long va,
>  {
>      pgentry_t *pgt = NULL;
>      unsigned long done = 0;
> -    unsigned long i;
> -    int rc;
>  
>      if ( !mfns ) 
>      {
> @@ -539,6 +570,9 @@ int do_map_frames(unsigned long va,
>          memset(err, 0x00, n * sizeof(int));
>      while ( done < n )
>      {
> +#ifdef CONFIG_PARAVIRT
> +        unsigned long i;
> +        int rc;
>          unsigned long todo;
>  
>          if ( err )
> @@ -578,6 +612,17 @@ int do_map_frames(unsigned long va,
>              }
>          }
>          done += todo;
> +#else
> +        if ( !pgt || !(va & L1_MASK) )
> +            pgt = need_pgt(va & ~L1_MASK);
> +        if ( !pgt )
> +            return -ENOMEM;
> +
> +        ASSERT(!(*pgt & _PAGE_PSE));
> +        pgt[l1_table_offset(va)] = (pgentry_t)
> +            (((mfns[done * stride] + done * incr) << PAGE_SHIFT) | prot);
> +        done++;
> +#endif
>      }
>  
>      return 0;
> @@ -609,16 +654,21 @@ void *map_frames_ex(const unsigned long *mfns, unsigned 
> long n,
>  #define UNMAP_BATCH ((STACK_SIZE / 2) / sizeof(multicall_entry_t))
>  int unmap_frames(unsigned long va, unsigned long num_frames)
>  {
> +#ifdef CONFIG_PARAVIRT
>      int n = UNMAP_BATCH;
>      multicall_entry_t call[n];
>      int ret;
>      int i;
> +#else
> +    pgentry_t *pgt;
> +#endif
>  
>      ASSERT(!((unsigned long)va & ~PAGE_MASK));
>  
>      DEBUG("va=%p, num=0x%lx\n", va, num_frames);
>  
>      while ( num_frames ) {
> +#ifdef CONFIG_PARAVIRT
>          if ( n > num_frames )
>              n = num_frames;
>  
> @@ -653,6 +703,17 @@ int unmap_frames(unsigned long va, unsigned long 
> num_frames)
>              }
>          }
>          num_frames -= n;
> +#else
> +        pgt = get_pgt(va);
> +        if ( pgt )
> +        {
> +            ASSERT(!(*pgt & _PAGE_PSE));
> +            *pgt = 0;
> +            invlpg(va);
> +        }
> +        va += PAGE_SIZE;
> +        num_frames--;
> +#endif
>      }
>      return 0;
>  }
> @@ -662,14 +723,24 @@ int unmap_frames(unsigned long va, unsigned long 
> num_frames)
>   */
>  static void clear_bootstrap(void)
>  {
> +#ifdef CONFIG_PARAVIRT
>      pte_t nullpte = { };
>      int rc;
> +#else
> +    pgentry_t *pgt;
> +#endif
>  
>      /* Use first page as the CoW zero page */
>      memset(&_text, 0, PAGE_SIZE);
>      mfn_zero = virt_to_mfn((unsigned long) &_text);
> +#ifdef CONFIG_PARAVIRT
>      if ( (rc = HYPERVISOR_update_va_mapping(0, nullpte, UVMF_INVLPG)) )
>          printk("Unable to unmap NULL page. rc=%d\n", rc);
> +#else
> +    pgt = get_pgt((unsigned long)&_text);
> +    *pgt = 0;
> +    invlpg((unsigned long)&_text);
> +#endif
>  }
>  
>  #ifdef CONFIG_PARAVIRT
> diff --git a/arch/x86/traps.c b/arch/x86/traps.c
> index 2d3222d..aa17da3 100644
> --- a/arch/x86/traps.c
> +++ b/arch/x86/traps.c
> @@ -121,7 +121,9 @@ void page_walk(unsigned long virt_address)
>  static int handle_cow(unsigned long addr) {
>          pgentry_t *tab = pt_base, page;
>       unsigned long new_page;
> +#ifdef CONFIG_PARAVIRT
>       int rc;
> +#endif
>  
>  #if defined(__x86_64__)
>          page = tab[l4_table_offset(addr)];
> @@ -137,6 +139,8 @@ static int handle_cow(unsigned long addr) {
>          page = tab[l2_table_offset(addr)];
>       if (!(page & _PAGE_PRESENT))
>           return 0;
> +     if ( page & _PAGE_PSE )
> +         return 0;
>          tab = pte_to_virt(page);
>          
>          page = tab[l1_table_offset(addr)];
> @@ -149,12 +153,18 @@ static int handle_cow(unsigned long addr) {
>       new_page = alloc_pages(0);
>       memset((void*) new_page, 0, PAGE_SIZE);
>  
> +#ifdef CONFIG_PARAVIRT
>       rc = HYPERVISOR_update_va_mapping(addr & PAGE_MASK, __pte(virt_to_mach(new_page) | L1_PROT), UVMF_INVLPG);
>       if (!rc)
>               return 1;
>  
>       printk("Map zero page to %lx failed: %d.\n", addr, rc);
>       return 0;
> +#else
> +     tab[l1_table_offset(addr)] = virt_to_mach(new_page) | L1_PROT;
> +     invlpg(addr);
> +     return 1;
> +#endif
>  }
>  
>  static void do_stack_walk(unsigned long frame_base)
> diff --git a/include/x86/arch_mm.h b/include/x86/arch_mm.h
> index 28ab406..e0ae552 100644
> --- a/include/x86/arch_mm.h
> +++ b/include/x86/arch_mm.h
> @@ -78,6 +78,8 @@
>  #define L2_PAGETABLE_ENTRIES    512
>  #define L3_PAGETABLE_ENTRIES    4
>  
> +#define PAGETABLE_LEVELS        3
> +
>  #define PADDR_BITS              44
>  #define PADDR_MASK              ((1ULL << PADDR_BITS)-1)
>  
> @@ -110,6 +112,8 @@ typedef uint64_t pgentry_t;
>  #define L3_PAGETABLE_ENTRIES    512
>  #define L4_PAGETABLE_ENTRIES    512
>  
> +#define PAGETABLE_LEVELS        4
> +
>  /* These are page-table limitations. Current CPUs support only 40-bit phys. */
>  #define PADDR_BITS              52
>  #define VADDR_BITS              48
> diff --git a/include/x86/os.h b/include/x86/os.h
> index 1083328..20cc27f 100644
> --- a/include/x86/os.h
> +++ b/include/x86/os.h
> @@ -206,6 +206,15 @@ static inline int irqs_disabled(void)
>   */
>  typedef struct { volatile int counter; } atomic_t;
>  
> +static inline void write_cr3(unsigned long cr3)
> +{
> +    asm volatile( "mov %0, %%cr3" : : "r" (cr3) : "memory" );
> +}
> +
> +static inline void invlpg(unsigned long va)
> +{
> +    asm volatile ( "invlpg %0": : "m" (*(const char *)(va)) : "memory" );
> +}
>  
>  /************************** i386 *******************************/
>  #ifdef __INSIDE_MINIOS__
> -- 
> 2.6.6
> 

-- 
Samuel
        /* Amuse the user. */
        printk(
"              \\|/ ____ \\|/\n"
"              \"@'/ ,. \\`@\"\n"
"              /_| \\__/ |_\\\n"
"                 \\__U_/\n");
(From linux/arch/sparc/kernel/traps.c:die_if_kernel())

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.