
Re: [Xen-ia64-devel] [PATCH][RFC] per vcpu VHPT



I sent out the old patches by mistake; sorry for that.
The newest ones are attached. Please discard the old ones.

On Mon, Jul 24, 2006 at 09:54:28PM +0900, Isaku Yamahata wrote:
> Hi.
> 
> I implemented per-vcpu VHPT for non-VTi domains.
> The motivation is to alleviate the cost of migrating vcpus between
> physical cpus under the credit scheduler.
> If more than one vcpu of the same domain runs on a physical cpu, the
> VHPT needs to be flushed on every vcpu switch. I'd like to avoid that.
> The patch is for discussion and performance evaluation, not for commit.
> 
> 
> I checked the mailing list archives and found the thread
> Xen/ia64 - global or per VP VHPT
> http://lists.xensource.com/archives/html/xen-devel/2005-04/msg01002.html
> 
> The discussion at that time didn't reach a conclusion.
> (At least that is my understanding; the thread was very long to follow,
> so I might be wrong. Please correct me if so.)
> With this patch we can measure the performance and decide whether to
> include it or to discard the idea.
> 
> 
> This patch introduces a compile-time option, xen_ia64_pervcpu_vhpt=y,
> to enable this feature, and a Xen boot-time option, pervcpu_vhpt=0,
> to disable per-vcpu VHPT allocation.
> The patch depends on the tlb tracking patch which I sent before.
> I attached these patches for convenience.
> 
> Thanks
> -- 
> yamahata

> # HG changeset patch
> # User yamahata@xxxxxxxxxxxxx
> # Node ID c654d462c4481685fb2e803e41cb2beba56bee4b
> # Parent  b2abc70be89e02d0d380674096c8c1fb9e552431
> import linux/include/linux/hash.h.
> PATCHNAME: import_linux_hash.h
> 
> Signed-off-by: Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
> 
> diff -r b2abc70be89e -r c654d462c448 xen/include/asm-ia64/linux/README.origin
> --- a/xen/include/asm-ia64/linux/README.origin        Wed Jul 19 07:17:54 2006 -0600
> +++ b/xen/include/asm-ia64/linux/README.origin        Mon Jul 24 21:34:37 2006 +0900
> @@ -8,6 +8,7 @@ bitmap.h              -> linux/include/linux/bitmap.
>  bitmap.h             -> linux/include/linux/bitmap.h
>  bitops.h             -> linux/include/linux/bitops.h
>  initrd.h             -> linux/include/linux/initrd.h
> +hash.h                       -> linux/include/linux/hash.h
>  jiffies.h            -> linux/include/linux/jiffies.h
>  kmalloc_sizes.h              -> linux/include/linux/kmalloc_sizes.h
>  linkage.h            -> linux/include/linux/linkage.h
> diff -r b2abc70be89e -r c654d462c448 xen/include/asm-ia64/linux/hash.h
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/include/asm-ia64/linux/hash.h       Mon Jul 24 21:34:37 2006 +0900
> @@ -0,0 +1,58 @@
> +#ifndef _LINUX_HASH_H
> +#define _LINUX_HASH_H
> +/* Fast hashing routine for a long.
> +   (C) 2002 William Lee Irwin III, IBM */
> +
> +/*
> + * Knuth recommends primes in approximately golden ratio to the maximum
> + * integer representable by a machine word for multiplicative hashing.
> + * Chuck Lever verified the effectiveness of this technique:
> + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
> + *
> + * These primes are chosen to be bit-sparse, that is operations on
> + * them can use shifts and additions instead of multiplications for
> + * machines where multiplications are slow.
> + */
> +#if BITS_PER_LONG == 32
> +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
> +#define GOLDEN_RATIO_PRIME 0x9e370001UL
> +#elif BITS_PER_LONG == 64
> +/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
> +#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
> +#else
> +#error Define GOLDEN_RATIO_PRIME for your wordsize.
> +#endif
> +
> +static inline unsigned long hash_long(unsigned long val, unsigned int bits)
> +{
> +     unsigned long hash = val;
> +
> +#if BITS_PER_LONG == 64
> +     /*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
> +     unsigned long n = hash;
> +     n <<= 18;
> +     hash -= n;
> +     n <<= 33;
> +     hash -= n;
> +     n <<= 3;
> +     hash += n;
> +     n <<= 3;
> +     hash -= n;
> +     n <<= 4;
> +     hash += n;
> +     n <<= 2;
> +     hash += n;
> +#else
> +     /* On some cpus multiply is faster, on others gcc will do shifts */
> +     hash *= GOLDEN_RATIO_PRIME;
> +#endif
> +
> +     /* High bits are more random, so use them. */
> +     return hash >> (BITS_PER_LONG - bits);
> +}
> +     
> +static inline unsigned long hash_ptr(void *ptr, unsigned int bits)
> +{
> +     return hash_long((unsigned long)ptr, bits);
> +}
> +#endif /* _LINUX_HASH_H */

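As a side note, here is a minimal standalone sketch (not part of the patches) of how
the imported hash_long() ends up being used: the tlb_track patch below hashes
the address of a p2m pte to pick a bucket in a page-sized table of list heads
(see tlb_track_hash_head()). The multiply form, the sample pointer value and
the bucket count are illustrative assumptions only.

/* sketch: multiplicative hashing of a pte pointer into a bucket index */
#include <stdio.h>

#define BITS_PER_LONG       64
/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1, as in hash.h above */
#define GOLDEN_RATIO_PRIME  0x9e37fffffffc0001UL

static unsigned long hash_long(unsigned long val, unsigned int bits)
{
    /* equivalent to the shift/add sequence in hash.h: multiply by the
       bit-sparse golden-ratio prime and keep the high, well-mixed bits */
    return (val * GOLDEN_RATIO_PRIME) >> (BITS_PER_LONG - bits);
}

int main(void)
{
    unsigned long ptep = 0xf000000004123450UL; /* hypothetical pte address */
    unsigned int hash_shift = 9;               /* hypothetical: 512 buckets */
    printf("bucket %lu of %u\n", hash_long(ptep, hash_shift), 1u << hash_shift);
    return 0;
}
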
> # HG changeset patch
> # User yamahata@xxxxxxxxxxxxx
> # Node ID cb0aa2b2e180d76d09592ed32338f9cb4ac5b7a0
> # Parent  c654d462c4481685fb2e803e41cb2beba56bee4b
> add tlb insert tracking so that the vTLB can be flushed over a finer-grained
> virtual address range when a page is unmapped from a domain.
> This functionality is enabled with a compile-time option,
> xen_ia64_tlb_track=y.
> PATCHNAME: tlb_track
> 
> Signed-off-by: Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
> 
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/Rules.mk
> --- a/xen/arch/ia64/Rules.mk  Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/Rules.mk  Mon Jul 24 21:35:16 2006 +0900
> @@ -39,6 +39,9 @@ ifeq ($(xen_ia64_dom0_virtual_physical),
>  ifeq ($(xen_ia64_dom0_virtual_physical),y)
>  CFLAGS       += -DCONFIG_XEN_IA64_DOM0_VP
>  endif
> +ifeq ($(xen_ia64_tlb_track),y)
> +CFLAGS       += -DCONFIG_XEN_IA64_TLB_TRACK
> +endif
>  ifeq ($(no_warns),y)
>  CFLAGS       += -Wa,--fatal-warnings -Werror -Wno-uninitialized
>  endif
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/Makefile
> --- a/xen/arch/ia64/xen/Makefile      Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/Makefile      Mon Jul 24 21:35:16 2006 +0900
> @@ -27,3 +27,4 @@ obj-y += privop_stat.o
>  obj-y += privop_stat.o
>  
>  obj-$(crash_debug) += gdbstub.o
> +obj-$(xen_ia64_tlb_track) += tlb_track.o
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/domain.c
> --- a/xen/arch/ia64/xen/domain.c      Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/domain.c      Mon Jul 24 21:35:16 2006 +0900
> @@ -60,6 +60,9 @@
>  #include <asm/regionreg.h>
>  #include <asm/dom_fw.h>
>  #include <asm/privop_stat.h>
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +#include <asm/tlb_track.h>
> +#endif
>  
>  #ifndef CONFIG_XEN_IA64_DOM0_VP
>  #define CONFIG_DOMAIN0_CONTIGUOUS
> @@ -351,6 +354,10 @@ int arch_domain_create(struct domain *d)
>       if (is_idle_domain(d))
>           return 0;
>  
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +     if (tlb_track_create(d) < 0)
> +       goto fail_nomem;
> +#endif
>       d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
>       if (d->shared_info == NULL)
>           goto fail_nomem;
> @@ -389,6 +396,9 @@ void arch_domain_destroy(struct domain *
>       if (d->shared_info != NULL)
>           free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
>  
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +     tlb_track_destroy(d);
> +#endif
>       domain_flush_destroy (d);
>  
>       deallocate_rid_range(d);
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/faults.c
> --- a/xen/arch/ia64/xen/faults.c      Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/faults.c      Mon Jul 24 21:35:16 2006 +0900
> @@ -27,6 +27,7 @@
>  #include <asm/debugger.h>
>  #include <asm/fpswa.h>
>  #include <asm/bundle.h>
> +#include <asm/p2m_entry.h>
>  #include <asm/privop_stat.h>
>  #include <asm/asm-xsi-offsets.h>
>  
> @@ -202,8 +203,15 @@ void ia64_do_page_fault (unsigned long a
>       fault = vcpu_translate(current,address,is_data,&pteval,&itir,&iha);
>       if (fault == IA64_NO_FAULT || fault == IA64_USE_TLB) {
>               struct p2m_entry entry;
> -             pteval = translate_domain_pte(pteval, address, itir, &logps, &entry);
> -             vcpu_itc_no_srlz(current,is_data?2:1,address,pteval,-1UL,logps);
> +             unsigned long m_pteval;
> +             m_pteval = translate_domain_pte(pteval, address, itir, &logps, &entry);
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
> +             vcpu_itc_no_srlz(current, (is_data? 2: 1) | 4, 
> +                              address, m_pteval, pteval, logps);
> +#else
> +             vcpu_itc_no_srlz(current, (is_data? 2: 1) | 4, 
> +                              address, m_pteval, pteval, logps, &entry);
> +#endif
>               if ((fault == IA64_USE_TLB && !current->arch.dtlb.pte.p) ||
>                   p2m_entry_retry(&entry)) {
>                       /* dtlb has been purged in-between.  This dtlb was
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/mm.c
> --- a/xen/arch/ia64/xen/mm.c  Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/mm.c  Mon Jul 24 21:35:16 2006 +0900
> @@ -170,13 +170,14 @@
>  #include <asm/pgalloc.h>
>  #include <asm/vhpt.h>
>  #include <asm/vcpu.h>
> +#include <asm/p2m_entry.h>
>  #include <linux/efi.h>
>  
>  #ifndef CONFIG_XEN_IA64_DOM0_VP
>  #define CONFIG_DOMAIN0_CONTIGUOUS
>  #else
> -static void domain_page_flush(struct domain* d, unsigned long mpaddr,
> -                              unsigned long old_mfn, unsigned long new_mfn);
> +static void domain_page_flush(struct domain* d,
> +                              volatile pte_t* ptep, pte_t old_pte);
>  #endif
>  
>  static struct domain *dom_xen, *dom_io;
> @@ -718,6 +719,19 @@ void *domain_mpa_to_imva(struct domain *
>  }
>  #endif
>  
> +static unsigned long
> +assign_flags_to_pteflags(unsigned long flags)
> +{
> +    unsigned long pteflags =
> +        (flags & ASSIGN_readonly)? _PAGE_AR_R: _PAGE_AR_RWX;
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +    if (flags & ASSIGN_tlb_track) {
> +        pteflags |= _PAGE_TLB_TRACKING;
> +    }
> +#endif
> +    return pteflags;
> +}
> +
>  /* Allocate a new page for domain and map it to the specified metaphysical
>     address.  */
>  static struct page_info *
> @@ -811,7 +825,7 @@ assign_new_domain0_page(struct domain *d
>  }
>  
>  /* map a physical address to the specified metaphysical addr */
> -// flags: currently only ASSIGN_readonly
> +// flags: ASSIGN_xxx 
>  // This is called by assign_domain_mmio_page().
>  // So accessing to pte is racy.
>  void
> @@ -823,13 +837,13 @@ __assign_domain_page(struct domain *d,
>      pte_t old_pte;
>      pte_t new_pte;
>      pte_t ret_pte;
> -    unsigned long arflags = (flags & ASSIGN_readonly)? _PAGE_AR_R: _PAGE_AR_RWX;
> +    unsigned long pteflags = assign_flags_to_pteflags(flags);
>  
>      pte = lookup_alloc_domain_pte(d, mpaddr);
>  
>      old_pte = __pte(0);
>      new_pte = pfn_pte(physaddr >> PAGE_SHIFT,
> -                      __pgprot(__DIRTY_BITS | _PAGE_PL_2 | arflags));
> +                      __pgprot(__DIRTY_BITS | _PAGE_PL_2 | pteflags));
>      ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
>      if (pte_val(ret_pte) == pte_val(old_pte))
>          smp_mb();
> @@ -945,7 +959,7 @@ assign_domain_mach_page(struct domain *d
>  // caller must call set_gpfn_from_mfn() before call if necessary.
>  // because set_gpfn_from_mfn() result must be visible before pte xchg
>  // caller must use memory barrier. NOTE: xchg has acquire semantics.
> -// flags: currently only ASSIGN_readonly
> +// flags: ASSIGN_xxx
>  static void
>  assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
>                             unsigned long mfn, unsigned long flags)
> @@ -954,11 +968,11 @@ assign_domain_page_replace(struct domain
>      volatile pte_t* pte;
>      pte_t old_pte;
>      pte_t npte;
> -    unsigned long arflags = (flags & ASSIGN_readonly)? _PAGE_AR_R: _PAGE_AR_RWX;
> +    unsigned long pteflags = assign_flags_to_pteflags(flags);
>      pte = lookup_alloc_domain_pte(d, mpaddr);
>  
>      // update pte
> -    npte = pfn_pte(mfn, __pgprot(__DIRTY_BITS | _PAGE_PL_2 | arflags));
> +    npte = pfn_pte(mfn, __pgprot(__DIRTY_BITS | _PAGE_PL_2 | pteflags));
>      old_pte = ptep_xchg(mm, mpaddr, pte, npte);
>      if (pte_mem(old_pte)) {
>          unsigned long old_mfn = pte_pfn(old_pte);
> @@ -978,7 +992,7 @@ assign_domain_page_replace(struct domain
>                  set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
>              }
>  
> -            domain_page_flush(d, mpaddr, old_mfn, mfn);
> +            domain_page_flush(d, pte, old_pte);
>  
>              try_to_clear_PGC_allocate(d, old_page);
>              put_page(old_page);
> @@ -997,29 +1011,29 @@ assign_domain_page_cmpxchg_rel(struct do
>      struct mm_struct *mm = &d->arch.mm;
>      volatile pte_t* pte;
>      unsigned long old_mfn;
> -    unsigned long old_arflags;
> +    unsigned long old_pteflags;
>      pte_t old_pte;
>      unsigned long new_mfn;
> -    unsigned long new_arflags;
> +    unsigned long new_pteflags;
>      pte_t new_pte;
>      pte_t ret_pte;
>  
>      pte = lookup_alloc_domain_pte(d, mpaddr);
>  
>   again:
> -    old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
> +    old_pteflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
>      old_mfn = page_to_mfn(old_page);
> -    old_pte = pfn_pte(old_mfn, __pgprot(old_arflags));
> +    old_pte = pfn_pte(old_mfn, __pgprot(old_pteflags));
>      if (!pte_present(old_pte)) {
> -        DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx\n",
> -                __func__, pte_val(old_pte), old_arflags, old_mfn);
> +        DPRINTK("%s: old_pte 0x%lx old_pteflags 0x%lx old_mfn 0x%lx\n",
> +                __func__, pte_val(old_pte), old_pteflags, old_mfn);
>          return -EINVAL;
>      }
>  
> -    new_arflags = (flags & ASSIGN_readonly)? _PAGE_AR_R: _PAGE_AR_RWX;
> +    new_pteflags = assign_flags_to_pteflags(flags);
>      new_mfn = page_to_mfn(new_page);
>      new_pte = pfn_pte(new_mfn,
> -                      __pgprot(__DIRTY_BITS | _PAGE_PL_2 | new_arflags));
> +                      __pgprot(__DIRTY_BITS | _PAGE_PL_2 | new_pteflags));
>  
>      // update pte
>      ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
> @@ -1028,10 +1042,10 @@ assign_domain_page_cmpxchg_rel(struct do
>              goto again;
>          }
>  
> -        DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx "
> +        DPRINTK("%s: old_pte 0x%lx old_pteflags 0x%lx old_mfn 0x%lx "
>                  "ret_pte 0x%lx ret_mfn 0x%lx\n",
>                  __func__,
> -                pte_val(old_pte), old_arflags, old_mfn,
> +                pte_val(old_pte), old_pteflags, old_mfn,
>                  pte_val(ret_pte), pte_pfn(ret_pte));
>          return -EINVAL;
>      }
> @@ -1043,7 +1057,7 @@ assign_domain_page_cmpxchg_rel(struct do
>  
>      set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
>  
> -    domain_page_flush(d, mpaddr, old_mfn, new_mfn);
> +    domain_page_flush(d, pte, old_pte);
>      put_page(old_page);
>      return 0;
>  }
> @@ -1111,7 +1125,7 @@ zap_domain_page_one(struct domain *d, un
>          set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
>      }
>  
> -    domain_page_flush(d, mpaddr, mfn, INVALID_MFN);
> +    domain_page_flush(d, pte, old_pte);
>  
>      if (page_get_owner(page) != NULL) {
>          try_to_clear_PGC_allocate(d, page);
> @@ -1199,8 +1213,12 @@ create_grant_host_mapping(unsigned long 
>      BUG_ON(ret == 0);
>      BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
>             get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
> -    assign_domain_page_replace(d, gpaddr, mfn, (flags & GNTMAP_readonly)?
> -                                              ASSIGN_readonly: ASSIGN_writable);
> +    assign_domain_page_replace(d, gpaddr, mfn,
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +                               ASSIGN_tlb_track |
> +#endif
> +                               ((flags & GNTMAP_readonly) ?
> +                                ASSIGN_readonly: ASSIGN_writable));
>      return GNTST_okay;
>  }
>  
> @@ -1254,7 +1272,7 @@ destroy_grant_host_mapping(unsigned long
>      }
>      BUG_ON(pte_pfn(old_pte) != mfn);
>  
> -    domain_page_flush(d, gpaddr, mfn, INVALID_MFN);
> +    domain_page_flush(d, pte, old_pte);
>  
>      page = mfn_to_page(mfn);
>      BUG_ON(page_get_owner(page) == d);//try_to_clear_PGC_allocate(d, page) is not needed.
> @@ -1418,11 +1436,38 @@ guest_physmap_remove_page(struct domain 
>  
>  //XXX sledgehammer.
>  //    flush finer range.
> -void
> -domain_page_flush(struct domain* d, unsigned long mpaddr,
> -                  unsigned long old_mfn, unsigned long new_mfn)
> -{
> +static void
> +domain_page_flush(struct domain* d, volatile pte_t* ptep, pte_t old_pte)
> +{
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
>      domain_flush_vtlb_all();
> +#else
> +    struct tlb_track_entry* entry;
> +    switch (tlb_track_search_and_remove(d->arch.tlb_track,
> +                                        ptep, old_pte, &entry)) {
> +    case TLB_TRACK_NOT_TRACKED:
> +        //DPRINTK("%s TLB_TRACK_NOT_TRACKED\n", __func__);
> +        domain_flush_vtlb_all();
> +        break;
> +    case TLB_TRACK_NOT_FOUND:
> +        // do nothing
> +        //DPRINTK("%s TLB_TRACK_NOT_FOUND\n", __func__);
> +        break;
> +    case TLB_TRACK_FOUND:
> +        //DPRINTK("%s TLB_TRACK_FOUND\n", __func__);
> +        domain_flush_vltb_track_entry(d, entry);
> +        tlb_track_free_entry(d->arch.tlb_track, entry);
> +        break;
> +    case TLB_TRACK_MANY:
> +        DPRINTK("%s TLB_TRACK_MANY\n", __func__);
> +        domain_flush_vtlb_all();
> +        break;
> +    case TLB_TRACK_AGAIN:
> +        DPRINTK("%s TLB_TRACK_AGAIN\n", __func__);
> +        BUG();
> +        break;
> +    }
> +#endif
>  }
>  
>  int
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/vcpu.c
> --- a/xen/arch/ia64/xen/vcpu.c        Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/vcpu.c        Mon Jul 24 21:35:16 2006 +0900
> @@ -22,6 +22,7 @@
>  #include <asm/vmx_phy_mode.h>
>  #include <asm/bundle.h>
>  #include <asm/privop_stat.h>
> +#include <asm/p2m_entry.h>
>  
>  /* FIXME: where these declarations should be there ? */
>  extern void getreg(unsigned long regnum, unsigned long *val, int *nat, struct pt_regs *regs);
> @@ -2003,7 +2004,11 @@ IA64FAULT vcpu_set_dtr(VCPU *vcpu, u64 s
>   VCPU translation cache access routines
>  **************************************************************************/
>  
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
>  void vcpu_itc_no_srlz(VCPU *vcpu, UINT64 IorD, UINT64 vaddr, UINT64 pte, UINT64 mp_pte, UINT64 logps)
> +#else
> +void vcpu_itc_no_srlz(VCPU *vcpu, UINT64 IorD, UINT64 vaddr, UINT64 pte, UINT64 mp_pte, UINT64 logps, struct p2m_entry* entry)
> +#endif
>  {
>       unsigned long psr;
>       unsigned long ps = (vcpu->domain==dom0) ? logps : PAGE_SHIFT;
> @@ -2017,6 +2022,9 @@ void vcpu_itc_no_srlz(VCPU *vcpu, UINT64
>  
>  #ifdef CONFIG_XEN_IA64_DOM0_VP
>       BUG_ON(logps > PAGE_SHIFT);
> +#endif
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +     vcpu_tlb_track_insert_or_dirty(vcpu, vaddr, entry);
>  #endif
>       psr = ia64_clear_ic();
>       ia64_itc(IorD,vaddr,pte,ps); // FIXME: look for bigger mappings
> @@ -2035,7 +2043,7 @@ void vcpu_itc_no_srlz(VCPU *vcpu, UINT64
>       // PAGE_SIZE mapping in the vhpt for now, else purging is complicated
>       else vhpt_insert(vaddr,pte,PAGE_SHIFT<<2);
>  #endif
> -     if ((mp_pte == -1UL) || (IorD & 0x4)) // don't place in 1-entry TLB
> +     if (IorD & 0x4) // don't place in 1-entry TLB
>               return;
>       if (IorD & 0x1) {
>               vcpu_set_tr_entry(&PSCBX(vcpu,itlb),mp_pte,ps<<2,vaddr);
> @@ -2060,7 +2068,11 @@ again:
>       pteval = translate_domain_pte(pte, ifa, itir, &logps, &entry);
>       if (!pteval) return IA64_ILLOP_FAULT;
>       if (swap_rr0) set_one_rr(0x0,PSCB(vcpu,rrs[0]));
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
>       vcpu_itc_no_srlz(vcpu,2,ifa,pteval,pte,logps);
> +#else
> +     vcpu_itc_no_srlz(vcpu,2,ifa,pteval,pte,logps,&entry);
> +#endif
>       if (swap_rr0) set_metaphysical_rr0();
>       if (p2m_entry_retry(&entry)) {
>               vcpu_flush_tlb_vhpt_range(ifa, logps);
> @@ -2083,7 +2095,11 @@ again:
>       pteval = translate_domain_pte(pte, ifa, itir, &logps, &entry);
>       if (!pteval) return IA64_ILLOP_FAULT;
>       if (swap_rr0) set_one_rr(0x0,PSCB(vcpu,rrs[0]));
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
>       vcpu_itc_no_srlz(vcpu, 1,ifa,pteval,pte,logps);
> +#else
> +     vcpu_itc_no_srlz(vcpu, 1,ifa,pteval,pte,logps,&entry);
> +#endif
>       if (swap_rr0) set_metaphysical_rr0();
>       if (p2m_entry_retry(&entry)) {
>               vcpu_flush_tlb_vhpt_range(ifa, logps);
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/vhpt.c
> --- a/xen/arch/ia64/xen/vhpt.c        Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/vhpt.c        Mon Jul 24 21:35:16 2006 +0900
> @@ -227,6 +227,48 @@ void domain_flush_vtlb_range (struct dom
>       ia64_global_tlb_purge(vadr,vadr+addr_range,PAGE_SHIFT);
>  }
>  
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +#include <asm/tlb_track.h>
> +void
> +domain_flush_vltb_track_entry(struct domain* d,
> +                              const struct tlb_track_entry* entry)
> +{
> +     unsigned long old_rid;
> +     struct vcpu* v;
> +     int cpu;
> +
> +     //tlb_track_entry_printf(entry);
> +     vcpu_get_rr(current, 0, &old_rid);
> +     vcpu_set_rr(current, 0, entry->rid);
> +    
> +     for_each_vcpu(d, v) {
> +             if (!test_bit(_VCPUF_initialised, &v->vcpu_flags))
> +                     continue;
> +             if (!vcpu_isset(v->vcpu_id, entry->vcpu_dirty_mask))
> +                     continue;
> +
> +             /* Purge TC entries.
> +                FIXME: clear only if match.  */
> +             vcpu_purge_tr_entry(&PSCBX(v, dtlb));
> +             vcpu_purge_tr_entry(&PSCBX(v, itlb));
> +     }
> +     smp_mb();
> +
> +     for_each_cpu_mask(cpu, entry->pcpu_dirty_mask) {
> +             //printk("%s:%d cpu %d\n", __func__, __LINE__, cpu);
> +             /* Invalidate VHPT entries.  */
> +             cpu_flush_vhpt_range(cpu, entry->vaddr, PAGE_SIZE);
> +     }
> +     // ptc.ga has release semantics.
> +
> +     /* ptc.ga  */
> +     ia64_global_tlb_purge(entry->vaddr, entry->vaddr + PAGE_SIZE,
> +                           PAGE_SHIFT);
> +
> +     vcpu_set_rr(current, 0, old_rid);
> +}
> +#endif
> +
>  static void flush_tlb_vhpt_all (struct domain *d)
>  {
>       /* First VHPT.  */
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/asm-ia64/domain.h
> --- a/xen/include/asm-ia64/domain.h   Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/include/asm-ia64/domain.h   Mon Jul 24 21:35:16 2006 +0900
> @@ -12,28 +12,10 @@
>  #include <xen/cpumask.h>
>  #include <asm/fpswa.h>
>  
> -struct p2m_entry {
> -    volatile pte_t*     pte;
> -    pte_t               used;
> -};
> -
> -static inline void
> -p2m_entry_set(struct p2m_entry* entry, volatile pte_t* pte, pte_t used)
> -{
> -    entry->pte  = pte;
> -    entry->used = used;
> -}
> -
> -static inline int
> -p2m_entry_retry(struct p2m_entry* entry)
> -{
> -    //XXX see lookup_domain_pte().
> -    //    NULL is set for invalid gpaddr for the time being.
> -    if (entry->pte == NULL)
> -        return 0;
> -
> -    return (pte_val(*entry->pte) != pte_val(entry->used));
> -}
> +struct p2m_entry;
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +struct tlb_track;
> +#endif
>  
>  extern void domain_relinquish_resources(struct domain *);
>  
> @@ -118,6 +100,10 @@ struct arch_domain {
>      void *fpswa_inf;
>  
>      struct last_vcpu last_vcpu[NR_CPUS];
> +
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +    struct tlb_track*   tlb_track;
> +#endif
>  };
>  #define INT_ENABLE_OFFSET(v)                   \
>      (sizeof(vcpu_info_t) * (v)->vcpu_id + \
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/asm-ia64/tlbflush.h
> --- a/xen/include/asm-ia64/tlbflush.h Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/include/asm-ia64/tlbflush.h Mon Jul 24 21:35:16 2006 +0900
> @@ -22,6 +22,13 @@ void domain_flush_vtlb_all (void);
>  /* Global range-flush of vTLB.  */
>  void domain_flush_vtlb_range (struct domain *d, u64 vadr, u64 addr_range);
>  
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +struct tlb_track_entry;
> +/* Global entry-flush of vTLB */
> +void domain_flush_vltb_track_entry(struct domain* d,
> +                                const struct tlb_track_entry* entry);
> +#endif
> +
>  /* Final vTLB flush on every dirty cpus.  */
>  void domain_flush_destroy (struct domain *d);
>  
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/asm-ia64/vcpu.h
> --- a/xen/include/asm-ia64/vcpu.h     Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/include/asm-ia64/vcpu.h     Mon Jul 24 21:35:16 2006 +0900
> @@ -158,7 +158,12 @@ extern void vcpu_set_next_timer(VCPU *vc
>  extern void vcpu_set_next_timer(VCPU *vcpu);
>  extern BOOLEAN vcpu_timer_expired(VCPU *vcpu);
>  extern UINT64 vcpu_deliverable_interrupts(VCPU *vcpu);
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
>  extern void vcpu_itc_no_srlz(VCPU *vcpu, UINT64, UINT64, UINT64, UINT64, UINT64);
> +#else
> +struct p2m_entry;
> +extern void vcpu_itc_no_srlz(VCPU *vcpu, UINT64, UINT64, UINT64, UINT64, UINT64, struct p2m_entry*);
> +#endif
>  extern UINT64 vcpu_get_tmp(VCPU *, UINT64);
>  extern void vcpu_set_tmp(VCPU *, UINT64, UINT64);
>  
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/public/arch-ia64.h
> --- a/xen/include/public/arch-ia64.h  Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/include/public/arch-ia64.h  Mon Jul 24 21:35:16 2006 +0900
> @@ -357,8 +357,14 @@ DEFINE_XEN_GUEST_HANDLE(vcpu_guest_conte
>                                                  // address space.
>  // flags for page assignement to pseudo physical address space
>  #define _ASSIGN_readonly                0
> +#define _ASSIGN_tlb_track               1
> +
>  #define ASSIGN_readonly                 (1UL << _ASSIGN_readonly)
>  #define ASSIGN_writable                 (0UL << _ASSIGN_readonly) // dummy flag
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +# define ASSIGN_tlb_track               (1UL << _ASSIGN_tlb_track)
> +#endif
> +
>  
>  /* This structure has the same layout of struct ia64_boot_param, defined in
>     <asm/system.h>.  It is redefined here to ease use.  */
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/tlb_track.c
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/arch/ia64/xen/tlb_track.c   Mon Jul 24 21:35:16 2006 +0900
> @@ -0,0 +1,558 @@
> +/******************************************************************************
> + * tlb_track.c
> + *
> + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> + *
> + */
> +
> +#include <asm/tlb_track.h>
> +#include <asm/p2m_entry.h>
> +#include <asm/vmx_mm_def.h> // for IA64_RR_SHIFT
> +#include <asm/vcpu.h> // for PSCB()
> +
> +#define CONFIG_TLB_TRACK_DEBUG
> +#ifdef CONFIG_TLB_TRACK_DEBUG
> +# define tlb_track_printd(fmt, ...)     \
> +    printf("%s:%d " fmt, __func__, __LINE__, ##__VA_ARGS__)
> +#else
> +# define tlb_track_printd(fmt, ...)     do { } while (0)
> +#endif
> +
> +#define CONFIG_TLB_TRACK_STAT_KEY_HANDLER
> +#ifdef CONFIG_TLB_TRACK_STAT_KEY_HANDLER
> +#include <asm/regs.h>
> +#include <xen/keyhandler.h>
> +
> +static void
> +dump_tlb_track_stat(unsigned char key)
> +{
> +    tlb_track_stat_printf(&dom0->arch.tlb_track->stat);
> +}
> +#endif
> +
> +static int
> +tlb_track_allocate_entries(struct tlb_track* tlb_track)
> +{
> +    struct page_info* entry_page;
> +    struct tlb_track_entry* track_entries;
> +    unsigned int allocated;
> +    unsigned long i;
> +
> +    BUG_ON(tlb_track->num_free > 0);
> +    if (tlb_track->num_entries >= tlb_track->limit) {
> +        DPRINTK("%s: num_entries %d limit %d\n",
> +                __func__, tlb_track->num_entries, tlb_track->limit);
> +        return -ENOMEM;
> +    }
> +    entry_page = alloc_domheap_page(NULL);
> +    if (entry_page == NULL) {
> +        DPRINTK("%s: domheap page failed. num_entries %d limit %d\n",
> +                __func__, tlb_track->num_entries, tlb_track->limit);
> +        return -ENOMEM;
> +    }
> +
> +    list_add(&entry_page->list, &tlb_track->page_list);
> +    track_entries = (struct tlb_track_entry*)page_to_virt(entry_page);
> +    allocated = PAGE_SIZE / sizeof(track_entries[0]);
> +    tlb_track->num_entries += allocated;
> +    tlb_track->num_free += allocated;
> +    for (i = 0; i < allocated; i++) {
> +        list_add(&track_entries[i].list, &tlb_track->free_list);
> +        //tlb_track_printd("track_entries[%ld] 0x%p\n", i, &track_entries[i]);
> +    }
> +    tlb_track_printd("allocated %d num_entries %d num_free %d\n",
> +                     allocated, tlb_track->num_entries, tlb_track->num_free);
> +    return 0;
> +}
> +
> +
> +int
> +tlb_track_create(struct domain* d)
> +{
> +    struct tlb_track* tlb_track = NULL;
> +    struct page_info* hash_page = NULL;
> +    unsigned int hash_size;
> +    unsigned int hash_shift;
> +    unsigned int i;
> +
> +    tlb_track = xmalloc(struct tlb_track);
> +    if (tlb_track == NULL) {
> +        goto out;
> +    }
> +    hash_page = alloc_domheap_page(NULL);
> +    if (hash_page == NULL) {
> +        goto out;
> +    }
> +
> +    spin_lock_init(&tlb_track->free_list_lock);
> +    INIT_LIST_HEAD(&tlb_track->free_list);
> +    tlb_track->limit = TLB_TRACK_LIMIT_ENTRIES;
> +    tlb_track->num_entries = 0;
> +    tlb_track->num_free = 0;
> +    INIT_LIST_HEAD(&tlb_track->page_list);
> +    if (tlb_track_allocate_entries(tlb_track) < 0) {
> +        goto out;
> +    }
> +
> +    spin_lock_init(&tlb_track->hash_lock);
> +    //XXX hash size optimization
> +    hash_size = PAGE_SIZE / sizeof(tlb_track->hash[0]);
> +    for (hash_shift = 0; (1 << (hash_shift + 1)) < hash_size; hash_shift++)
> +        /* nothing */;
> +    tlb_track->hash_size = (1 << hash_shift);
> +    tlb_track->hash_shift = hash_shift;
> +    tlb_track->hash_mask = (1 << hash_shift) - 1;
> +    tlb_track->hash = page_to_virt(hash_page);
> +    for (i = 0; i < tlb_track->hash_size; i++) {
> +        INIT_LIST_HEAD(&tlb_track->hash[i]);
> +    }
> +
> +    memset(&tlb_track->stat, 0, sizeof(tlb_track->stat));
> +
> +    smp_mb(); // make initialization visible before use.
> +    d->arch.tlb_track = tlb_track;
> +    printk("%s:%d hash 0x%p hash_size %d \n",
> +           __func__, __LINE__, tlb_track->hash, tlb_track->hash_size);
> +
> +#ifdef CONFIG_TLB_TRACK_STAT_KEY_HANDLER
> +    register_keyhandler(
> +                's', dump_tlb_track_stat, "dump dom0 tlb track stats");
> +#endif
> +    return 0;
> +
> +out:
> +    if (hash_page != NULL) {
> +        free_domheap_page(hash_page);
> +    }
> +    if (tlb_track != NULL) {
> +        xfree(tlb_track);
> +    }
> +    return -ENOMEM;
> +}
> +
> +void
> +tlb_track_destroy(struct domain* d)
> +{
> +    struct tlb_track* tlb_track = d->arch.tlb_track;
> +    struct page_info* page;
> +    struct page_info* next;
> +
> +    spin_lock(&tlb_track->free_list_lock);
> +    BUG_ON(tlb_track->num_free != tlb_track->num_entries);
> +
> +    list_for_each_entry_safe(page, next, &tlb_track->page_list, list) {
> +        list_del(&page->list);
> +        free_domheap_page(page);
> +    }
> +
> +    free_domheap_page(virt_to_page(tlb_track->hash));
> +    xfree(tlb_track);
> +    //d->tlb_track = NULL;
> +}
> +
> +static struct tlb_track_entry*
> +tlb_track_get_entry(struct tlb_track* tlb_track)
> +{
> +    struct tlb_track_entry* entry = NULL;
> +    spin_lock(&tlb_track->free_list_lock);
> +    if (tlb_track->num_free == 0) {
> +        (void)tlb_track_allocate_entries(tlb_track);
> +    }
> +    if (tlb_track->num_free > 0) {
> +        BUG_ON(list_empty(&tlb_track->free_list));
> +        entry = list_entry(tlb_track->free_list.next,
> +                           struct tlb_track_entry, list);
> +        tlb_track->num_free--;
> +        list_del(&entry->list);
> +    }
> +    spin_unlock(&tlb_track->free_list_lock);
> +    return entry;
> +}
> +
> +void
> +tlb_track_free_entry(struct tlb_track* tlb_track,
> +                     struct tlb_track_entry* entry)
> +{
> +    spin_lock(&tlb_track->free_list_lock);
> +    list_add(&entry->list, &tlb_track->free_list);
> +    tlb_track->num_free++;
> +    spin_unlock(&tlb_track->free_list_lock);
> +}
> +
> +
> +#include <linux/hash.h>
> +// XXX hash function.
> +static struct list_head*
> +tlb_track_hash_head(struct tlb_track* tlb_track, volatile pte_t* ptep)
> +{
> +    unsigned long hash = hash_long((unsigned long)ptep, tlb_track->hash_shift);
> +    BUG_ON(hash >= tlb_track->hash_size);
> +    BUG_ON((hash & tlb_track->hash_mask) != hash);
> +    return &tlb_track->hash[hash];
> +}
> +
> +static int
> +tlb_track_pte_zapped(pte_t old_pte, pte_t ret_pte)
> +{
> +    if (pte_pfn(old_pte) != pte_pfn(ret_pte) ||
> +        (pte_val(old_pte) & ~(_PFN_MASK | _PAGE_TLB_TRACK_MASK)) !=
> +        (pte_val(ret_pte) & ~(_PFN_MASK | _PAGE_TLB_TRACK_MASK))) {
> +        // Other thread zapped the p2m entry.
> +        return 1;
> +    }
> +    return 0;
> +}
> +
> +static TLB_TRACK_RET_T
> +tlb_track_insert_or_dirty(struct tlb_track* tlb_track, struct mm_struct* mm,
> +                          volatile pte_t* ptep, pte_t old_pte,
> +                          unsigned long vaddr, unsigned long rid)
> +{
> +    unsigned long mfn = pte_pfn(old_pte);
> +    struct list_head* head = tlb_track_hash_head(tlb_track, ptep);
> +    struct tlb_track_entry* entry;
> +    struct tlb_track_entry* new_entry = NULL;
> +    unsigned long bit_to_be_set = _PAGE_TLB_INSERTED;
> +    pte_t new_pte;
> +    pte_t ret_pte;
> +
> +    struct vcpu* v = current;
> +    TLB_TRACK_RET_T ret = TLB_TRACK_NOT_FOUND;
> +
> +    tlb_track->stat.iod++;
> +    if (!pte_tlb_tracking(old_pte)) {
> +        tlb_track->stat.iod_not_tracked++;
> +        return TLB_TRACK_NOT_TRACKED;
> +    }
> +    if (pte_tlb_inserted_many(old_pte)) {
> +        tlb_track->stat.iod_tracked_many++;
> +        return TLB_TRACK_MANY;
> +    }
> +
> +    // vaddr must be normalized so that it is in rr0 and page aligned.
> +    BUG_ON((vaddr >> IA64_RR_SHIFT) != 0);
> +    BUG_ON((vaddr & ~PAGE_MASK) != 0);
> +#if 0
> +    tlb_track_printd("\n"
> +                     "\tmfn 0x%016lx\n"
> +                     "\told_pte 0x%016lx ptep 0x%p\n"
> +                     "\tptep_val 0x%016lx vaddr 0x%016lx rid %ld\n"
> +                     "\ttlb_track 0x%p head 0x%p\n",
> +                     mfn,
> +                     pte_val(old_pte), ptep, pte_val(*ptep),
> +                     vaddr, rid,
> +                     tlb_track, head);
> +#endif
> +
> + again:
> +    // The zapping side may zap the p2m entry and then remove the tlb track
> +    // entry non-atomically, so we may see a stale tlb track entry here.
> +    // p2m_entry_retry() handles such a case.
> +    // Or another thread may zap the p2m entry, remove the tlb track entry
> +    // and insert a new tlb track entry.
> +    spin_lock(&tlb_track->hash_lock);
> +    list_for_each_entry(entry, head, list) {
> +        if (entry->ptep != ptep) {
> +            continue;
> +        }
> +
> +        if (pte_pfn(entry->pte_val) == mfn) {
> +            //tlb_track_entry_printf(entry);
> +            if (entry->vaddr == vaddr && entry->rid == rid) {
> +                //tlb_track_printd("TLB_TRACK_FOUND\n");
> +                ret = TLB_TRACK_FOUND;
> +                tlb_track->stat.iod_found++;
> +#ifdef CONFIG_TLB_TRACK_CNT
> +                entry->cnt++;
> +                if (entry->cnt > TLB_TRACK_CNT_FORCE_MANY) {
> +                    // Heuristics:
> +                    // If a page is used to transfer data via a device
> +                    // channel, it is usually unmapped after only a few
> +                    // accesses (one or two tlb inserts) once the real
> +                    // device I/O completes, i.e. within a short period.
> +                    // However this page seems to be accessed many times,
> +                    // so we guess it is used as an I/O ring and tracking
> +                    // this entry is probably useless.
> +                    //tlb_track_entry_printf(entry);
> +                    //tlb_track_printd("cnt = %ld\n", entry->cnt);
> +                    tlb_track->stat.iod_force_many++;
> +                    goto force_many;
> +                }
> +#endif
> +                goto found;
> +            } else {
> +#ifdef CONFIG_TLB_TRACK_CNT
> +            force_many:
> +#endif
> +                if (!pte_tlb_inserted(old_pte)) {
> +                    printk("%s:%d racy update\n", __func__, __LINE__);
> +                    old_pte = __pte(pte_val(old_pte) | _PAGE_TLB_INSERTED);
> +                }
> +                new_pte = __pte(pte_val(old_pte) | _PAGE_TLB_INSERTED_MANY);
> +                ret_pte = ptep_cmpxchg_rel(mm, vaddr, ptep, old_pte, new_pte);
> +                if (pte_val(ret_pte) != pte_val(old_pte)) {
> +                    //tlb_track_printd("TLB_TRACK_AGAIN\n");
> +                    ret = TLB_TRACK_AGAIN;
> +                    tlb_track->stat.iod_again++;
> +                } else {
> +                    //tlb_track_printd("TLB_TRACK_MANY del entry 0x%p\n", entry);
> +                    ret = TLB_TRACK_MANY;
> +                    list_del(&entry->list);
> +                    //tlb_track_entry_printf(entry);
> +                    tlb_track->stat.iod_tracked_many_del++;
> +                }
> +                goto out;
> +            }
> +        }
> +
> +        // Another thread changed the p2m entry, removing the old tlb track
> +        // entry and inserting a new one, after we read old_pte but before
> +        // we took the spinlock.
> +        //tlb_track_printd("TLB_TRACK_AGAIN\n");
> +        ret = TLB_TRACK_AGAIN;
> +        tlb_track->stat.iod_again++;
> +        goto out;
> +    }
> +
> +    entry = NULL; // prevent freeing entry.
> +    if (pte_tlb_inserted(old_pte)) {
> +        // Another thread removed the tlb_track_entry after we got old_pte,
> +        // before we got the spin lock.
> +        ret = TLB_TRACK_AGAIN;
> +        tlb_track->stat.iod_again++;
> +        goto out;
> +    }
> +    if (new_entry == NULL && bit_to_be_set == _PAGE_TLB_INSERTED) {
> +        spin_unlock(&tlb_track->hash_lock);
> +        new_entry = tlb_track_get_entry(tlb_track);
> +        if (new_entry == NULL) {
> +            tlb_track_printd("get_entry failed\n");
> +            // An entry can't be allocated;
> +            // fall back to full flush mode.
> +            bit_to_be_set |= _PAGE_TLB_INSERTED_MANY;
> +            tlb_track->stat.iod_new_failed++;
> +        }
> +        //tlb_track_printd("new_entry 0x%p\n", new_entry);
> +        tlb_track->stat.iod_new_entry++;
> +        goto again;
> +    }
> +
> +    BUG_ON(pte_tlb_inserted_many(old_pte));
> +    new_pte = __pte(pte_val(old_pte) | bit_to_be_set);
> +    ret_pte = ptep_cmpxchg_rel(mm, vaddr, ptep, old_pte, new_pte);
> +    if (pte_val(old_pte) != pte_val(ret_pte)) {
> +        if (tlb_track_pte_zapped(old_pte, ret_pte)) {
> +            //tlb_track_printd("zapped TLB_TRACK_AGAIN\n");
> +            ret = TLB_TRACK_AGAIN;
> +            tlb_track->stat.iod_again++;
> +            goto out;
> +        }
> +
> +        // Other thread set _PAGE_TLB_INSERTED and/or _PAGE_TLB_INSERTED_MANY
> +        if (pte_tlb_inserted_many(ret_pte)) {
> +            // Other thread already set _PAGE_TLB_INSERTED_MANY and
> +            // removed the entry.
> +            //tlb_track_printd("inserted TLB_TRACK_MANY\n");
> +            BUG_ON(!pte_tlb_inserted(ret_pte));
> +            ret = TLB_TRACK_MANY;
> +            tlb_track->stat.iod_new_many++;
> +            goto out;
> +        }
> +        BUG_ON(pte_tlb_inserted(ret_pte));
> +        BUG();
> +    }
> +    if (new_entry) {
> +        //tlb_track_printd("inserting new_entry 0x%p\n", new_entry);
> +        entry = new_entry;
> +        new_entry = NULL;
> +
> +        entry->ptep = ptep;
> +        entry->pte_val = old_pte;
> +        entry->vaddr = vaddr;
> +        entry->rid = rid;
> +        cpus_clear(entry->pcpu_dirty_mask);
> +        vcpus_clear(entry->vcpu_dirty_mask);
> +        list_add(&entry->list, head);
> +
> +#ifdef CONFIG_TLB_TRACK_CNT
> +        entry->cnt = 0;
> +#endif
> +        tlb_track->stat.iod_insert++;
> +        //tlb_track_entry_printf(entry);
> +    } else {
> +        goto out;
> +    }
> +
> + found:
> +    BUG_ON(v->processor >= NR_CPUS);
> +    cpu_set(v->processor, entry->pcpu_dirty_mask);
> +    BUG_ON(v->vcpu_id >= NR_CPUS);
> +    vcpu_set(v->vcpu_id, entry->vcpu_dirty_mask);
> +    tlb_track->stat.iod_dirtied++;
> +
> + out:
> +    spin_unlock(&tlb_track->hash_lock);
> +    if (ret == TLB_TRACK_MANY && entry != NULL) {
> +        tlb_track_free_entry(tlb_track, entry);
> +    }
> +    if (new_entry != NULL) {
> +        tlb_track_free_entry(tlb_track, new_entry);
> +    }
> +    return ret;
> +}
> +
> +void
> +vcpu_tlb_track_insert_or_dirty(struct vcpu *vcpu, unsigned long vaddr,
> +                               struct p2m_entry* entry)
> +{
> +    unsigned long vrn = vaddr >> IA64_RR_SHIFT;
> +    unsigned long rid = PSCB(vcpu, rrs[vrn]);
> +    TLB_TRACK_RET_T ret;
> +
> +    vaddr = (vaddr << 3) >> 3;// mask rid bit
> +    vaddr &= PAGE_MASK;
> +    ret = tlb_track_insert_or_dirty(vcpu->domain->arch.tlb_track,
> +                                    &vcpu->domain->arch.mm,
> +                                    entry->ptep, entry->used,
> +                                    vaddr, rid);
> +    if (ret == TLB_TRACK_AGAIN) {
> +        p2m_entry_set_retry(entry);
> +    }
> +}
> +
> +TLB_TRACK_RET_T
> +tlb_track_search_and_remove(struct tlb_track* tlb_track,
> +                            volatile pte_t* ptep, pte_t old_pte,
> +                            struct tlb_track_entry** entryp)
> +{
> +    unsigned long mfn = pte_pfn(old_pte);
> +    struct list_head* head = tlb_track_hash_head(tlb_track, ptep);
> +    struct tlb_track_entry* entry;
> +
> +    tlb_track->stat.sar++;
> +    if (!pte_tlb_tracking(old_pte)) {
> +        tlb_track->stat.sar_not_tracked++;
> +        return TLB_TRACK_NOT_TRACKED;
> +    }
> +    if (!pte_tlb_inserted(old_pte)) {
> +        BUG_ON(pte_tlb_inserted_many(old_pte));
> +        tlb_track->stat.sar_not_found++;
> +        return TLB_TRACK_NOT_FOUND;
> +    }
> +    if (pte_tlb_inserted_many(old_pte)) {
> +        BUG_ON(!pte_tlb_inserted(old_pte));
> +        tlb_track->stat.sar_many++;
> +        return TLB_TRACK_MANY;
> +    }
> +
> +    spin_lock(&tlb_track->hash_lock);
> +    list_for_each_entry(entry, head, list) {
> +        if (entry->ptep != ptep) {
> +            continue;
> +        }
> +        if (pte_pfn(entry->pte_val) == mfn) {
> +            list_del(&entry->list);
> +            tlb_track->stat.sar_found++;
> +            spin_unlock(&tlb_track->hash_lock);
> +            *entryp = entry;
> +            //tlb_track_entry_printf(entry);
> +#ifdef CONFIG_TLB_TRACK_CNT
> +            //tlb_track_printd("cnt = %ld\n", entry->cnt);
> +#endif
> +            return TLB_TRACK_FOUND;
> +        }
> +        BUG();
> +    }
> +    BUG();
> +    spin_unlock(&tlb_track->hash_lock);
> +    return TLB_TRACK_NOT_TRACKED;
> +}
> +
> +void
> +tlb_track_stat_printf(const struct tlb_track_stat* stat)
> +{
> +    printk("iod %ld\n"
> +           "iod_again %ld\n"
> +           "iod_not_tracked %ld\n"
> +           "iod_force_many %ld\n"
> +           "iod_tracked_many %ld\n"
> +           "iod_tracked_many_del %ld\n"
> +           "iod_found %ld\n"
> +           "iod_new_entry %ld\n"
> +           "iod_new_failed %ld\n"
> +           "iod_new_many %ld\n"
> +           "iod_insert %ld\n"
> +           "iod_dirtied %ld\n"
> +           "sar %ld\n"
> +           "sar_not_tracked %ld\n"
> +           "sar_not_found %ld\n"
> +           "sar_found %ld\n"
> +           "sar_many %ld\n",
> +           stat->iod,
> +           stat->iod_again,
> +           stat->iod_not_tracked,
> +           stat->iod_force_many,
> +           stat->iod_tracked_many,
> +           stat->iod_tracked_many_del,
> +           stat->iod_found,
> +           stat->iod_new_entry,
> +           stat->iod_new_failed,
> +           stat->iod_new_many,
> +           stat->iod_insert,
> +           stat->iod_dirtied,
> +           stat->sar,
> +           stat->sar_not_tracked,
> +           stat->sar_not_found,
> +           stat->sar_found,
> +           stat->sar_many);
> +}
> +
> +// for debug
> +void
> +__tlb_track_entry_printf(const char* func, int line,
> +                         const struct tlb_track_entry* entry)
> +{
> +    char pcpumask_buf[NR_CPUS + 1];
> +    char vcpumask_buf[MAX_VIRT_CPUS + 1];
> +    cpumask_scnprintf(pcpumask_buf, sizeof(pcpumask_buf),
> +                      entry->pcpu_dirty_mask);
> +    vcpumask_scnprintf(vcpumask_buf, sizeof(vcpumask_buf),
> +                       entry->vcpu_dirty_mask);
> +    printk("%s:%d\n"
> +           "\tmfn 0x%016lx\n"
> +           "\told_pte 0x%016lx ptep 0x%p\n"
> +           "\tpte_val 0x%016lx vaddr 0x%016lx rid %ld\n"
> +           "\tpcpu_dirty_mask %s vcpu_dirty_mask %s\n"
> +           "\tentry 0x%p\n",
> +           func, line,
> +           pte_pfn(entry->pte_val),
> +           pte_val(entry->pte_val), entry->ptep, pte_val(*entry->ptep),
> +           entry->vaddr, entry->rid,
> +           pcpumask_buf, vcpumask_buf,
> +           entry);
> +}
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-set-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/asm-ia64/p2m_entry.h
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/include/asm-ia64/p2m_entry.h        Mon Jul 24 21:35:16 2006 +0900
> @@ -0,0 +1,76 @@
> +/******************************************************************************
> + * p2m_entry.h
> + *
> + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> + *
> + */
> +
> +#ifndef __ASM_P2M_ENTRY_H__
> +#define __ASM_P2M_ENTRY_H__
> +
> +#include <asm/tlb_track.h>
> +
> +struct p2m_entry {
> +#define P2M_PTE_ALWAYS_RETRY ((volatile pte_t*) -1)
> +    volatile pte_t*     ptep;
> +    pte_t               used;
> +};
> +
> +static inline void
> +p2m_entry_set(struct p2m_entry* entry, volatile pte_t* ptep, pte_t used)
> +{
> +    entry->ptep = ptep;
> +    entry->used = used;
> +}
> +
> +static inline void
> +p2m_entry_set_retry(struct p2m_entry* entry)
> +{
> +    entry->ptep = P2M_PTE_ALWAYS_RETRY;
> +}
> +
> +static inline int
> +p2m_entry_retry(struct p2m_entry* entry)
> +{
> +    //XXX see lookup_domain_pte().
> +    //    NULL is set for invalid gpaddr for the time being.
> +    if (entry->ptep == NULL)
> +        return 0;
> +
> +    if (entry->ptep == P2M_PTE_ALWAYS_RETRY)
> +        return 1;
> +
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +    return ((pte_val(*entry->ptep) & ~_PAGE_TLB_TRACK_MASK) !=
> +            (pte_val(entry->used) & ~_PAGE_TLB_TRACK_MASK));
> +#else
> +    return (pte_val(*entry->ptep) != pte_val(entry->used));
> +#endif
> +}
> +
> +#endif // __ASM_P2M_ENTRY_H__
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-set-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/asm-ia64/tlb_track.h
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/include/asm-ia64/tlb_track.h        Mon Jul 24 21:35:16 2006 +0900
> @@ -0,0 +1,201 @@
> +/******************************************************************************
> + * tlb_track.h
> + *
> + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> + *
> + */
> +
> +#ifndef __TLB_TRACK_H__
> +#define __TLB_TRACK_H__
> +
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +
> +#include <asm/domain.h>
> +#include <xen/list.h>
> +
> +#define _PAGE_TLB_TRACKING_BIT          53
> +#define _PAGE_TLB_INSERTED_BIT          54
> +#define _PAGE_TLB_INSERTED_MANY_BIT     55
> +
> +#define _PAGE_TLB_TRACKING              (1UL << _PAGE_TLB_TRACKING_BIT)
> +#define _PAGE_TLB_INSERTED              (1UL << _PAGE_TLB_INSERTED_BIT)
> +#define _PAGE_TLB_INSERTED_MANY         (1UL << _PAGE_TLB_INSERTED_MANY_BIT)
> +#define _PAGE_TLB_TRACK_MASK            (_PAGE_TLB_TRACKING | _PAGE_TLB_INSERTED | _PAGE_TLB_INSERTED_MANY)
> +
> +#define pte_tlb_tracking(pte)                   \
> +    ((pte_val(pte) & _PAGE_TLB_TRACKING) != 0)
> +#define pte_tlb_inserted(pte)                   \
> +    ((pte_val(pte) & _PAGE_TLB_INSERTED) != 0)
> +#define pte_tlb_inserted_many(pte)                  \
> +    ((pte_val(pte) & _PAGE_TLB_INSERTED_MANY) != 0)
> +
> +
> +// vcpu mask
> +// stolen from cpumask.h
> +typedef struct { DECLARE_BITMAP(bits, MAX_VIRT_CPUS); } vcpumask_t;
> +
> +#define vcpu_set(vcpu, dst) __vcpu_set((vcpu), &(dst))
> +static inline void __vcpu_set(int vcpu, volatile vcpumask_t *dstp)
> +{
> +     set_bit(vcpu, dstp->bits);
> +}
> +#define vcpus_clear(dst) __vcpus_clear(&(dst), MAX_VIRT_CPUS)
> +static inline void __vcpus_clear(vcpumask_t *dstp, int nbits)
> +{
> +     bitmap_zero(dstp->bits, nbits);
> +}
> +/* No static inline type checking - see Subtlety (1) above. */
> +#define vcpu_isset(vcpu, vcpumask) test_bit((vcpu), (vcpumask).bits)
> +
> +#define vcpumask_scnprintf(buf, len, src) \
> +                     __vcpumask_scnprintf((buf), (len), &(src), MAX_VIRT_CPUS)
> +static inline int __vcpumask_scnprintf(char *buf, int len,
> +                                     const vcpumask_t *srcp, int nbits)
> +{
> +     return bitmap_scnprintf(buf, len, srcp->bits, nbits);
> +}
> +
> +
> +// TODO: compact this structure.
> +struct tlb_track_entry {
> +    struct list_head   list;
> +
> +    
> +    volatile pte_t*     ptep;            // corresponding p2m entry
> +
> +    //XXX should we use TR_ENTRY?
> +    pte_t               pte_val;        // mfn and other flags
> +                                        // pte_val.p = 1:
> +                                        //   a tlb entry has been inserted.
> +                                        // pte_val.p = 0:
> +                                        //   a tlb entry was once inserted
> +                                        //   (hence this entry was created),
> +                                        //   but a tlb purge has since been
> +                                        //   issued, so this virtual address
> +                                        //   no longer needs to be purged.
> +    unsigned long       vaddr;          // virtual address
> +    unsigned long       rid;            // rid
> +
> +    cpumask_t           pcpu_dirty_mask;
> +    vcpumask_t          vcpu_dirty_mask;
> +    // tlbflush_timestamp;
> +
> +#define CONFIG_TLB_TRACK_CNT
> +#ifdef CONFIG_TLB_TRACK_CNT
> +#define TLB_TRACK_CNT_FORCE_MANY        256 //XXX how many?
> +    unsigned long       cnt;
> +#endif
> +};
> +
> +struct tlb_track_stat {
> +    // insert or dirty
> +    unsigned long       iod;
> +    unsigned long       iod_again;
> +    unsigned long       iod_not_tracked;
> +    unsigned long       iod_force_many;
> +    unsigned long       iod_tracked_many;
> +    unsigned long       iod_tracked_many_del;
> +    unsigned long       iod_found;
> +    unsigned long       iod_new_entry;
> +    unsigned long       iod_new_failed;
> +    unsigned long       iod_new_many;
> +    unsigned long       iod_insert;
> +    unsigned long       iod_dirtied;
> +    
> +    // search and remove
> +    unsigned long       sar;
> +    unsigned long       sar_not_tracked;
> +    unsigned long       sar_not_found;
> +    unsigned long       sar_found;
> +    unsigned long       sar_many;
> +};
> +void tlb_track_stat_printf(const struct tlb_track_stat* stat); 
> +
> +struct tlb_track {
> +
> +// see __gnttab_map_grant_ref()
> +// A domain can map granted-page up to MAPTRACK_MAX_ENTRIES pages.
> +#define TLB_TRACK_LIMIT_ENTRIES                                     \
> +    (MAPTRACK_MAX_ENTRIES * (PAGE_SIZE / sizeof(struct tlb_track)))
> +
> +    spinlock_t                  free_list_lock;
> +    struct list_head            free_list;
> +    unsigned int                limit;
> +    unsigned int                num_entries;
> +    unsigned int                num_free;
> +    struct list_head            page_list;
> +
> +    // XXX hash table size
> +    spinlock_t                  hash_lock;
> +    unsigned int                hash_size;
> +    unsigned int                hash_shift;
> +    unsigned int                hash_mask;
> +    struct list_head*           hash;
> +
> +    struct tlb_track_stat       stat;
> +};
> +
> +int tlb_track_create(struct domain* d);
> +void tlb_track_destroy(struct domain* d);
> +
> +void tlb_track_free_entry(struct tlb_track* tlb_track,
> +                          struct tlb_track_entry* entry);
> +
> +struct p2m_entry;
> +void
> +vcpu_tlb_track_insert_or_dirty(struct vcpu *vcpu, unsigned long vaddr,
> +                               struct p2m_entry* entry);
> +
> +// return value
> +// NULL if this entry is used
> +// entry if this entry isn't used
> +enum TLB_TRACK_RET {
> +    TLB_TRACK_NOT_TRACKED,
> +    TLB_TRACK_NOT_FOUND,
> +    TLB_TRACK_FOUND,
> +    TLB_TRACK_MANY,
> +    TLB_TRACK_AGAIN,
> +};
> +typedef enum TLB_TRACK_RET TLB_TRACK_RET_T;
> +
> +TLB_TRACK_RET_T
> +tlb_track_search_and_remove(struct tlb_track* tlb_track, 
> +                            volatile pte_t* ptep, pte_t old_pte, 
> +                            struct tlb_track_entry** entryp);
> +
> +void
> +__tlb_track_entry_printf(const char* func, int line,
> +                         const struct tlb_track_entry* entry);
> +#define tlb_track_entry_printf(entry)                       \
> +    __tlb_track_entry_printf(__func__, __LINE__, (entry))
> +#else
> +//define nop
> +
> +#endif // CONFIG_XEN_IA64_TLB_TRACK
> +
> +#endif // __TLB_TRACK_H__
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-set-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */

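Before the per-vcpu VHPT patch that builds on it, here is a rough standalone
sketch (assumptions only, not code from the patch) of how the three pte
software bits defined in tlb_track.h translate into a flush decision when a
page is unmapped, mirroring tlb_track_search_and_remove() and
domain_page_flush() above.

/* sketch of the unmap-time decision driven by the tracking bits */
#include <stdio.h>

#define _PAGE_TLB_TRACKING      (1UL << 53) /* page mapped with ASSIGN_tlb_track */
#define _PAGE_TLB_INSERTED      (1UL << 54) /* one tracked TLB insert recorded */
#define _PAGE_TLB_INSERTED_MANY (1UL << 55) /* too many/ambiguous inserts */

static const char *flush_for_unmap(unsigned long pte)
{
    if (!(pte & _PAGE_TLB_TRACKING))
        return "full vTLB flush (TLB_TRACK_NOT_TRACKED)";
    if (pte & _PAGE_TLB_INSERTED_MANY)
        return "full vTLB flush (TLB_TRACK_MANY)";
    if (pte & _PAGE_TLB_INSERTED)
        return "flush only the recorded vaddr/rid on the dirty cpus (TLB_TRACK_FOUND)";
    return "nothing to flush (TLB_TRACK_NOT_FOUND)";
}

int main(void)
{
    printf("%s\n", flush_for_unmap(_PAGE_TLB_TRACKING | _PAGE_TLB_INSERTED));
    return 0;
}

The point of the tracking is that the common TLB_TRACK_FOUND case replaces a
domain-wide vTLB flush with a purge of a single page at the recorded vaddr/rid
on only the cpus and vcpus recorded in the entry's dirty masks.
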
> # HG changeset patch
> # User yamahata@xxxxxxxxxxxxx
> # Node ID a56d48066373c9fe317e986580c08394fe89fc7e
> # Parent  cb0aa2b2e180d76d09592ed32338f9cb4ac5b7a0
> implement the per-vcpu VHPT option: allocate a VHPT per vcpu.
> added a compile-time option, xen_ia64_pervcpu_vhpt=y, to enable it.
> added a Xen boot-time option, pervcpu_vhpt=0, to disable it.
> This patch depends on the tlb tracking patch.
> PATCHNAME: pervcpu_vhpt
> 
> Signed-off-by: Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
> 
> diff -r cb0aa2b2e180 -r a56d48066373 xen/arch/ia64/Rules.mk
> --- a/xen/arch/ia64/Rules.mk  Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/arch/ia64/Rules.mk  Mon Jul 24 21:37:15 2006 +0900
> @@ -42,6 +42,9 @@ ifeq ($(xen_ia64_tlb_track),y)
>  ifeq ($(xen_ia64_tlb_track),y)
>  CFLAGS       += -DCONFIG_XEN_IA64_TLB_TRACK
>  endif
> +ifeq ($(xen_ia64_pervcpu_vhpt),y)
> +CFLAGS       += -DCONFIG_XEN_IA64_PERVCPU_VHPT
> +endif
>  ifeq ($(no_warns),y)
>  CFLAGS       += -Wa,--fatal-warnings -Werror -Wno-uninitialized
>  endif
> diff -r cb0aa2b2e180 -r a56d48066373 xen/arch/ia64/xen/domain.c
> --- a/xen/arch/ia64/xen/domain.c      Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/arch/ia64/xen/domain.c      Mon Jul 24 21:37:15 2006 +0900
> @@ -117,8 +117,12 @@ static void flush_vtlb_for_context_switc
>               if (VMX_DOMAIN(vcpu)) {
>                       // currently vTLB for vt-i domian is per vcpu.
>                       // so any flushing isn't needed.
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +             } else if (HAS_PERVCPU_VHPT(vcpu->domain)) {
> +                     // nothing to do
> +#endif
>               } else {
> -                     vhpt_flush();
> +                     local_vhpt_flush();
>               }
>               local_flush_tlb_all();
>       }
> @@ -133,9 +137,13 @@ void schedule_tail(struct vcpu *prev)
>               vmx_do_launch(current);
>       } else {
>               ia64_set_iva(&ia64_ivt);
> -             ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
> -                     VHPT_ENABLED);
> +             // Disable the VHPT. Without this, ia64_new_rr7() might
> +             // cause a VHPT fault because it flushes dtr[IA64_TR_VHPT].
> +             // (VHPT_SIZE_LOG2 << 2) is there only to avoid a
> +             // Reserved Register/Field fault.
> +             ia64_set_pta(VHPT_SIZE_LOG2 << 2);
>               load_region_regs(current);
> +             ia64_set_pta(vcpu_pta(current));
>               vcpu_load_kernel_regs(current);
>               __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
>                 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
> @@ -186,9 +194,13 @@ if (!i--) { i = 1000000; printk("+"); }
>  
>       nd = current->domain;
>       if (!is_idle_domain(nd)) {
> -             ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
> -                          VHPT_ENABLED);
> +             // Disable the VHPT. Without this, ia64_new_rr7() might
> +             // cause a VHPT fault because it changes dtr[IA64_TR_VHPT].
> +             // (VHPT_SIZE_LOG2 << 2) is there only to avoid a
> +             // Reserved Register/Field fault.
> +             ia64_set_pta(VHPT_SIZE_LOG2 << 2);
>               load_region_regs(current);
> +             ia64_set_pta(vcpu_pta(current));
>               vcpu_load_kernel_regs(current);
>               vcpu_set_next_timer(current);
>               if (vcpu_timer_expired(current))
> @@ -305,6 +317,17 @@ struct vcpu *alloc_vcpu_struct(struct do
>           v->arch.ending_rid = d->arch.ending_rid;
>           v->arch.breakimm = d->arch.breakimm;
>           v->arch.last_processor = INVALID_PROCESSOR;
> +
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +        if (HAS_PERVCPU_VHPT(d)) {
> +            if (pervcpu_vhpt_alloc(v) < 0) {
> +                free_xenheap_pages(v->arch.privregs,
> +                                   get_order(sizeof(mapped_regs_t)));
> +                free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
> +                return NULL;
> +            }
> +        }
> +#endif
>       }
>  
>       return v;
> @@ -315,6 +338,10 @@ void free_vcpu_struct(struct vcpu *v)
>       if (VMX_DOMAIN(v))
>               vmx_relinquish_vcpu_resources(v);
>       else {
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +        if (HAS_PERVCPU_VHPT(v->domain))
> +            pervcpu_vhpt_free(v);
> +#endif
>               if (v->arch.privregs != NULL)
>                       free_xenheap_pages(v->arch.privregs,
>                                     get_order_from_shift(XMAPPEDREGS_SHIFT));
> @@ -340,6 +367,11 @@ static void init_switch_stack(struct vcp
>       memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
>  }
>  
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +static int opt_pervcpu_vhpt = 1;
> +integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
> +#endif
> +
>  int arch_domain_create(struct domain *d)
>  {
>       int i;
> @@ -354,6 +386,13 @@ int arch_domain_create(struct domain *d)
>       if (is_idle_domain(d))
>           return 0;
>  
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +     d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
> +#if 1
> +     DPRINTK("%s:%d domain %d pervcpu_vhpt %d\n",
> +             __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
> +#endif
> +#endif
>  #ifdef CONFIG_XEN_IA64_TLB_TRACK
>       if (tlb_track_create(d) < 0)
>         goto fail_nomem;
> diff -r cb0aa2b2e180 -r a56d48066373 xen/arch/ia64/xen/regionreg.c
> --- a/xen/arch/ia64/xen/regionreg.c   Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/arch/ia64/xen/regionreg.c   Mon Jul 24 21:37:15 2006 +0900
> @@ -260,7 +260,7 @@ int set_one_rr(unsigned long rr, unsigne
>       } else if (rreg == 7) {
>               ia64_new_rr7(vmMangleRID(newrrv.rrval),v->domain->shared_info,
>                            v->arch.privregs, v->domain->arch.shared_info_va,
> -                          __get_cpu_var(vhpt_paddr));
> +                          vcpu_vhpt_maddr(v));
>       } else {
>               set_rr(rr,newrrv.rrval);
>       }
> diff -r cb0aa2b2e180 -r a56d48066373 xen/arch/ia64/xen/vhpt.c
> --- a/xen/arch/ia64/xen/vhpt.c        Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/arch/ia64/xen/vhpt.c        Mon Jul 24 21:37:15 2006 +0900
> @@ -23,18 +23,30 @@ DEFINE_PER_CPU (unsigned long, vhpt_padd
>  DEFINE_PER_CPU (unsigned long, vhpt_paddr);
>  DEFINE_PER_CPU (unsigned long, vhpt_pend);
>  
> -void vhpt_flush(void)
> -{
> -     struct vhpt_lf_entry *v = __va(__ia64_per_cpu_var(vhpt_paddr));
> +static void __vhpt_flush(unsigned long vhpt_maddr)
> +{
> +     struct vhpt_lf_entry *v = (struct vhpt_lf_entry*)__va(vhpt_maddr);
>       int i;
>  
>       for (i = 0; i < VHPT_NUM_ENTRIES; i++, v++)
>               v->ti_tag = INVALID_TI_TAG;
>  }
>  
> -static void vhpt_erase(void)
> -{
> -     struct vhpt_lf_entry *v = (struct vhpt_lf_entry *)VHPT_ADDR;
> +void local_vhpt_flush(void)
> +{
> +     __vhpt_flush(__ia64_per_cpu_var(vhpt_paddr));
> +}
> +
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +static void vcpu_vhpt_flush(struct vcpu* v)
> +{
> +     __vhpt_flush(vcpu_vhpt_maddr(v));
> +}
> +#endif
> +
> +static void vhpt_erase(unsigned long vhpt_maddr)
> +{
> +     struct vhpt_lf_entry *v = (struct vhpt_lf_entry*)__va(vhpt_maddr);
>       int i;
>  
>       for (i = 0; i < VHPT_NUM_ENTRIES; i++, v++) {
> @@ -44,17 +56,6 @@ static void vhpt_erase(void)
>               v->ti_tag = INVALID_TI_TAG;
>       }
>       // initialize cache too???
> -}
> -
> -
> -static void vhpt_map(unsigned long pte)
> -{
> -     unsigned long psr;
> -
> -     psr = ia64_clear_ic();
> -     ia64_itr(0x2, IA64_TR_VHPT, VHPT_ADDR, pte, VHPT_SIZE_LOG2);
> -     ia64_set_psr(psr);
> -     ia64_srlz_i();
>  }
>  
>  void vhpt_insert (unsigned long vadr, unsigned long pte, unsigned long logps)
> @@ -101,7 +102,7 @@ void vhpt_multiple_insert(unsigned long 
>  
>  void vhpt_init(void)
>  {
> -     unsigned long paddr, pte;
> +     unsigned long paddr;
>       struct page_info *page;
>  #if !VHPT_ENABLED
>       return;
> @@ -121,13 +122,54 @@ void vhpt_init(void)
>       __get_cpu_var(vhpt_pend) = paddr + (1 << VHPT_SIZE_LOG2) - 1;
>       printf("vhpt_init: vhpt paddr=0x%lx, end=0x%lx\n",
>               paddr, __get_cpu_var(vhpt_pend));
> -     pte = pte_val(pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL));
> -     vhpt_map(pte);
> -     ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
> -             VHPT_ENABLED);
> -     vhpt_erase();
> -}
> -
> +     vhpt_erase(paddr);
> +     // we don't enable VHPT here.
> +     // context_switch() or schedule_tail() does it.
> +}
> +
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +int
> +pervcpu_vhpt_alloc(struct vcpu *v)
> +{
> +     unsigned long vhpt_size_log2 = VHPT_SIZE_LOG2;
> +     DPRINTK("%s:%d allocating d 0x%p %d v 0x%p %d\n",
> +             __func__, __LINE__,
> +             v->domain, v->domain->domain_id,
> +             v, v->vcpu_id);
> +
> +     v->arch.vhpt_entries =
> +             (1UL << vhpt_size_log2) / sizeof(struct vhpt_lf_entry);
> +     v->arch.vhpt_page =
> +             alloc_domheap_pages(NULL, vhpt_size_log2 - PAGE_SHIFT, 0);
> +     if (!v->arch.vhpt_page)
> +             return -ENOMEM;
> +     
> +     v->arch.vhpt_maddr = page_to_maddr(v->arch.vhpt_page);
> +     if (v->arch.vhpt_maddr & ((1 << VHPT_SIZE_LOG2) - 1))
> +             panic("pervcpu_vhpt_alloc: bad VHPT alignment!\n");
> +
> +     v->arch.pta.val = 0; // clear all fields first
> +     v->arch.pta.ve = 1; // enable vhpt
> +     v->arch.pta.size = VHPT_SIZE_LOG2;
> +     v->arch.pta.vf = 1; // long format
> +     v->arch.pta.base = v->arch.vhpt_maddr >> 15;
> +
> +     vhpt_erase(v->arch.vhpt_maddr);
> +     return 0;
> +}
> +
> +void
> +pervcpu_vhpt_free(struct vcpu *v)
> +{
> +     unsigned long vhpt_size_log2 = VHPT_SIZE_LOG2;
> +     DPRINTK("%s:%d freeing d 0x%p %d v 0x%p %d\n",
> +             __func__, __LINE__,
> +             v->domain, v->domain->domain_id,
> +             v, v->vcpu_id);
> +
> +     free_domheap_pages(v->arch.vhpt_page, vhpt_size_log2 - PAGE_SHIFT);
> +}
> +#endif
>  
>  void vcpu_flush_vtlb_all(struct vcpu *v)
>  {
> @@ -136,7 +178,15 @@ void vcpu_flush_vtlb_all(struct vcpu *v)
>       vcpu_purge_tr_entry(&PSCBX(v,itlb));
>  
>       /* Then VHPT.  */
> -     vhpt_flush ();
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +     if (HAS_PERVCPU_VHPT(v->domain)) {
> +             vcpu_vhpt_flush(v);
> +     } else {
> +             local_vhpt_flush();
> +     }
> +#else
> +     local_vhpt_flush();
> +#endif
>  
>       /* Then mTLB.  */
>       local_flush_tlb_all ();
> @@ -169,9 +219,10 @@ void domain_flush_vtlb_all (void)
>       }
>  }
>  
> -static void cpu_flush_vhpt_range (int cpu, u64 vadr, u64 addr_range)
> -{
> -     void *vhpt_base = __va(per_cpu(vhpt_paddr, cpu));
> +static void __flush_vhpt_range(unsigned long vhpt_maddr,
> +                              u64 vadr, u64 addr_range)
> +{
> +     void *vhpt_base = __va(vhpt_maddr);
>  
>       while ((long)addr_range > 0) {
>               /* Get the VHPT entry.  */
> @@ -184,9 +235,30 @@ static void cpu_flush_vhpt_range (int cp
>       }
>  }
>  
> +static void cpu_vhpt_flush_range(int cpu, u64 vadr, u64 addr_range)
> +{
> +     __flush_vhpt_range(per_cpu(vhpt_paddr, cpu), vadr, addr_range);
> +}
> +
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +static void vcpu_vhpt_flush_range(struct vcpu* v, u64 vadr, u64 addr_range)
> +{
> +     __flush_vhpt_range(vcpu_vhpt_maddr(v), vadr, addr_range);
> +}
> +#endif
> +
>  void vcpu_flush_tlb_vhpt_range (u64 vadr, u64 log_range)
>  {
> -     cpu_flush_vhpt_range (current->processor, vadr, 1UL << log_range);
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +     if (HAS_PERVCPU_VHPT(current->domain)) {
> +             vcpu_vhpt_flush_range(current, vadr, 1UL << log_range);
> +     } else {
> +             cpu_vhpt_flush_range(current->processor,
> +                                  vadr, 1UL << log_range);
> +     }
> +#else
> +     cpu_vhpt_flush_range(current->processor, vadr, 1UL << log_range);
> +#endif
>       ia64_ptcl(vadr, log_range << 2);
>       ia64_srlz_i();
>  }
> @@ -218,8 +290,17 @@ void domain_flush_vtlb_range (struct dom
>               if (!test_bit(_VCPUF_initialised, &v->vcpu_flags))
>                       continue;
>  
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +             if (HAS_PERVCPU_VHPT(d)) {
> +                     vcpu_vhpt_flush_range(v, vadr, addr_range);
> +             } else {
> +                     /* Invalidate VHPT entries.  */
> +                     cpu_vhpt_flush_range(v->processor, vadr, addr_range);
> +             }
> +#else
>               /* Invalidate VHPT entries.  */
> -             cpu_flush_vhpt_range (v->processor, vadr, addr_range);
> +             cpu_vhpt_flush_range(v->processor, vadr, addr_range);
> +#endif
>       }
>       // ptc.ga has release semantics.
>  
> @@ -254,11 +335,30 @@ domain_flush_vltb_track_entry(struct dom
>       }
>       smp_mb();
>  
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +     if (HAS_PERVCPU_VHPT(d)) {
> +             for_each_vcpu(d, v) {
> +                     if (!test_bit(_VCPUF_initialised, &v->vcpu_flags))
> +                             continue;
> +                     if (!vcpu_isset(v->vcpu_id, entry->vcpu_dirty_mask))
> +                             continue;
> +
> +                     /* Invalidate VHPT entries.  */
> +                     vcpu_vhpt_flush_range(v, entry->vaddr, PAGE_SIZE);
> +             }
> +     } else {
> +             for_each_cpu_mask(cpu, entry->pcpu_dirty_mask) {
> +                     /* Invalidate VHPT entries.  */
> +                     cpu_vhpt_flush_range(cpu, entry->vaddr, PAGE_SIZE);
> +             }
> +     }
> +#else
>       for_each_cpu_mask(cpu, entry->pcpu_dirty_mask) {
>               //printk("%s:%d cpu %d\n", __func__, __LINE__, cpu);
>               /* Invalidate VHPT entries.  */
> -             cpu_flush_vhpt_range(cpu, entry->vaddr, PAGE_SIZE);
> -     }
> +             cpu_vhpt_flush_range(cpu, entry->vaddr, PAGE_SIZE);
> +     }
> +#endif
>       // ptc.ga has release semantics.
>  
>       /* ptc.ga  */
> @@ -272,7 +372,7 @@ static void flush_tlb_vhpt_all (struct d
>  static void flush_tlb_vhpt_all (struct domain *d)
>  {
>       /* First VHPT.  */
> -     vhpt_flush ();
> +     local_vhpt_flush ();
>  
>       /* Then mTLB.  */
>       local_flush_tlb_all ();
> @@ -281,7 +381,14 @@ void domain_flush_destroy (struct domain
>  void domain_flush_destroy (struct domain *d)
>  {
>       /* Very heavy...  */
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +     if (HAS_PERVCPU_VHPT(d))
> +             on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1, 1);
> +     else
> +             on_each_cpu((void (*)(void *))flush_tlb_vhpt_all, d, 1, 1);
> +#else
>       on_each_cpu ((void (*)(void *))flush_tlb_vhpt_all, d, 1, 1);
> +#endif
>       cpus_clear (d->domain_dirty_cpumask);
>  }
>  
> diff -r cb0aa2b2e180 -r a56d48066373 xen/include/asm-ia64/domain.h
> --- a/xen/include/asm-ia64/domain.h   Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/include/asm-ia64/domain.h   Mon Jul 24 21:37:15 2006 +0900
> @@ -63,6 +63,9 @@ struct arch_domain {
>          unsigned long flags;
>          struct {
>              unsigned int is_vti : 1;
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +            unsigned int has_pervcpu_vhpt : 1;
> +#endif
>          };
>      };
>  
> @@ -108,6 +111,13 @@ struct arch_domain {
>  #define INT_ENABLE_OFFSET(v)                   \
>      (sizeof(vcpu_info_t) * (v)->vcpu_id + \
>      offsetof(vcpu_info_t, evtchn_upcall_mask))
> +
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +#define HAS_PERVCPU_VHPT(d)     ((d)->arch.has_pervcpu_vhpt)
> +#else
> +#define HAS_PERVCPU_VHPT(d)     (0)
> +#endif
> +
>  
>  struct arch_vcpu {
>      /* Save the state of vcpu.
> @@ -158,6 +168,13 @@ struct arch_vcpu {
>      fpswa_ret_t fpswa_ret;   /* save return values of FPSWA emulation */
>      struct arch_vmx_struct arch_vmx; /* Virtual Machine Extensions */
>  
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +    PTA                 pta;
> +    unsigned long       vhpt_maddr;
> +    struct page_info*   vhpt_page;
> +    unsigned long       vhpt_entries;
> +#endif
> +
>  #define INVALID_PROCESSOR       INT_MAX
>      int last_processor;
>  };
> diff -r cb0aa2b2e180 -r a56d48066373 xen/include/asm-ia64/vhpt.h
> --- a/xen/include/asm-ia64/vhpt.h     Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/include/asm-ia64/vhpt.h     Mon Jul 24 21:37:15 2006 +0900
> @@ -42,11 +42,47 @@ extern void vhpt_multiple_insert(unsigne
>                                unsigned long logps);
>  extern void vhpt_insert (unsigned long vadr, unsigned long pte,
>                        unsigned long logps);
> -void vhpt_flush(void);
> +void local_vhpt_flush(void);
>  
>  /* Currently the VHPT is allocated per CPU.  */
>  DECLARE_PER_CPU (unsigned long, vhpt_paddr);
>  DECLARE_PER_CPU (unsigned long, vhpt_pend);
>  
> +#include <xen/sched.h>
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +#if !VHPT_ENABLED
> +#error "VHPT_ENABLED must be set for CONFIG_XEN_IA64_PERVCPU_VHPT"
> +#endif
> +int pervcpu_vhpt_alloc(struct vcpu *v);
> +void pervcpu_vhpt_free(struct vcpu *v);
> +#endif
> +static inline unsigned long
> +vcpu_vhpt_maddr(struct vcpu* v)
> +{
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +    if (HAS_PERVCPU_VHPT(v->domain)) {
> +        return v->arch.vhpt_maddr;
> +    }
> +#endif
> +
> +#if 0
> +    // referencing v->processor is racy.
> +    return per_cpu(vhpt_paddr, v->processor);
> +#endif
> +    BUG_ON(v != current);
> +    return __get_cpu_var(vhpt_paddr);
> +}
> +
> +static inline unsigned long
> +vcpu_pta(struct vcpu* v)
> +{
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +    if (HAS_PERVCPU_VHPT(v->domain)) {
> +        return v->arch.pta.val;
> +    }
> +#endif
> +    return VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) | VHPT_ENABLED;
> +}
> +
>  #endif /* !__ASSEMBLY */
>  #endif
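
On the ia64_set_pta() sequence in schedule_tail()/context_switch()
above: the first write keeps the ve bit clear, so the VHPT walker stays
off while ia64_new_rr7() reprograms dtr[IA64_TR_VHPT]; the second write
re-enables it, pointing at the per-vcpu VHPT when HAS_PERVCPU_VHPT() is
true and at the per-cpu one otherwise (see vcpu_pta()).  A standalone
sketch of the two PTA values follows; the field layout (ve bit 0, size
bits 7:2, vf bit 8, base bits 63:15) is the IA-64 PTA register, while
the size and address used here are made-up example numbers, not the Xen
constants.

/* standalone illustration of the PTA values; example numbers only */
#include <stdio.h>

#define EXAMPLE_VHPT_SIZE_LOG2  24UL            /* e.g. a 16MB VHPT */
#define PTA_VE                  (1UL << 0)      /* walker enable */
#define PTA_VF                  (1UL << 8)      /* long format */
#define PTA_SIZE(log2)          ((unsigned long)(log2) << 2)
#define PTA_BASE_MASK           (~((1UL << 15) - 1))

/* walker disabled: only the size field is set, which is just enough to
 * avoid a Reserved Register/Field fault while rr7 is being changed */
static unsigned long pta_walker_off(unsigned long size_log2)
{
    return PTA_SIZE(size_log2);
}

/* walker enabled and pointed at a (suitably aligned) per-vcpu VHPT,
 * the same composition vcpu_pta() builds via the PTA bitfields */
static unsigned long pta_pervcpu(unsigned long vhpt_maddr,
                                 unsigned long size_log2)
{
    return (vhpt_maddr & PTA_BASE_MASK) | PTA_VF
           | PTA_SIZE(size_log2) | PTA_VE;
}

int main(void)
{
    unsigned long maddr = 0x4000000UL;  /* example, VHPT-size aligned */
    printf("pta, walker off: 0x%016lx\n",
           pta_walker_off(EXAMPLE_VHPT_SIZE_LOG2));
    printf("pta, per-vcpu:   0x%016lx\n",
           pta_pervcpu(maddr, EXAMPLE_VHPT_SIZE_LOG2));
    return 0;
}

This is also why pervcpu_vhpt_alloc() checks that vhpt_maddr is aligned
to the VHPT size before using it as the PTA base, and why set_one_rr()
now passes vcpu_vhpt_maddr(v) into ia64_new_rr7(): with a per-vcpu VHPT
the mapping installed for rr7 has to follow the vcpu rather than the
physical cpu.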

> _______________________________________________
> Xen-ia64-devel mailing list
> Xen-ia64-devel@xxxxxxxxxxxxxxxxxxx
> http://lists.xensource.com/xen-ia64-devel

-- 
yamahata

Attachment: 10701:3cee9325a6c6_import_linux_hash.h.patch
Description: Text document

Attachment: 10702:b90fff753ca1_tlb_track.patch
Description: Text document

Attachment: 10703:f9b91b850f7b_pervcpu_vhpt.patch
Description: Text document

_______________________________________________
Xen-ia64-devel mailing list
Xen-ia64-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-ia64-devel

 

