
Re: [Xen-devel] [PATCH RESEND v5 5/6] xen/arm: Implement hypercall for dirty page tracing



On Fri, 2013-11-08 at 16:50 +0900, Jaeyong Yoo wrote:
> Add hypercall (shadow op: enable/disable and clean/peek dirtied page bitmap).
> It consists of two parts: dirty page detecting and saving.
> For detecting, we setup the guest p2m's leaf PTE read-only and whenever the guest
> tries to write something, permission fault happens and traps into xen.
> The permission-faulted GPA should be saved for the toolstack (when it wants to see
> which pages are dirtied). For this purpose, we temporarily save the GPAs into bitmap.
> 
> Changes from v4:
> 1. For temporary saving dirty pages, use bitmap rather than linked list.
> 2. Setup the p2m's second level page as read-write in the view of xen's memory access.
>    It happens in p2m_create_table function.
> 
> Signed-off-by: Jaeyong Yoo <jaeyong.yoo@xxxxxxxxxxx>
> ---
>  xen/arch/arm/domain.c           |  14 +++
>  xen/arch/arm/domctl.c           |   9 ++
>  xen/arch/arm/mm.c               | 103 +++++++++++++++++++-
>  xen/arch/arm/p2m.c              | 206 ++++++++++++++++++++++++++++++++++++++++
>  xen/arch/arm/traps.c            |   9 ++
>  xen/include/asm-arm/domain.h    |   7 ++
>  xen/include/asm-arm/mm.h        |   7 ++
>  xen/include/asm-arm/p2m.h       |   4 +
>  xen/include/asm-arm/processor.h |   2 +
>  9 files changed, 360 insertions(+), 1 deletion(-)
> 
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index c0b5dd8..0a32301 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -215,6 +215,12 @@ static void ctxt_switch_to(struct vcpu *n)
>      WRITE_SYSREG(hcr, HCR_EL2);
>      isb();
>  
> +    /* for dirty-page tracing
> +     * XXX: how do we consider SMP case?
> +     */
> +    if ( n->domain->arch.dirty.mode )
> +        restore_vlpt(n->domain);

This is an interesting question. xen_second is shared between all pcpus,
which means that the vlpt is currently only usable from a single
physical CPU at a time.

Currently the only per-cpu area is the domheap region from 2G-4G. We
could steal some address space from the top or bottom of there?
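
A strawman of what that carve-out might look like — VLPT_SIZE and
PERCPU_VLPT_BASE are invented names, not existing symbols, and the size
is purely illustrative:

        /* Hypothetical: one vlpt window per pcpu, carved out of the
         * per-cpu domheap area starting at DOMHEAP_VIRT_START. */
        #define VLPT_SIZE             (1UL << 30)
        #define PERCPU_VLPT_BASE(cpu) \
            (DOMHEAP_VIRT_START + (cpu) * VLPT_SIZE)

restore_vlpt() would then install the guest's p2m tables at
PERCPU_VLPT_BASE(smp_processor_id()) rather than into the shared
xen_second.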

>      /* This is could trigger an hardware interrupt from the virtual
>       * timer. The interrupt needs to be injected into the guest. */
>      virt_timer_restore(n);
> @@ -509,11 +515,19 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
>      /* Default the virtual ID to match the physical */
>      d->arch.vpidr = boot_cpu_data.midr.bits;
>  
> +    /* init for dirty-page tracing */
> +    d->arch.dirty.count = 0;
> +    d->arch.dirty.mode = 0;
> +    spin_lock_init(&d->arch.dirty.lock);
> +
>      d->arch.dirty.second_lvl_start = 0;
>      d->arch.dirty.second_lvl_end = 0;
>      d->arch.dirty.second_lvl[0] = NULL;
>      d->arch.dirty.second_lvl[1] = NULL;
>  
> +    memset(d->arch.dirty.bitmap, 0, sizeof(d->arch.dirty.bitmap));
> +    d->arch.dirty.bitmap_pages = 0;
> +
>      clear_page(d->shared_info);
>      share_xen_page_with_guest(
>          virt_to_page(d->shared_info), d, XENSHARE_writable);
> diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
> index cb38e59..eb74225 100644
> --- a/xen/arch/arm/domctl.c
> +++ b/xen/arch/arm/domctl.c
> @@ -93,6 +93,15 @@ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
>              xfree(c.data);
>      }
>      break;
> +    case XEN_DOMCTL_shadow_op:
> +    {
> +        domain_pause(d);
> +        ret = dirty_mode_op(d, &domctl->u.shadow_op);
> +        domain_unpause(d);
> +
> +        copyback = 1;
> +    }
> +    break;
>  
>      default:
>          return -EINVAL;
> diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
> index bf13993..d5a0a11 100644
> --- a/xen/arch/arm/mm.c
> +++ b/xen/arch/arm/mm.c
> @@ -845,7 +845,6 @@ void destroy_xen_mappings(unsigned long v, unsigned long e)
>      create_xen_entries(REMOVE, v, 0, (e - v) >> PAGE_SHIFT, 0);
>  }
>  
> -enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
>  static void set_pte_flags_on_range(const char *p, unsigned long l, enum mg mg)
>  {
>      lpae_t pte;
> @@ -1320,6 +1319,60 @@ int is_iomem_page(unsigned long mfn)
>   *  xen: arm: 64-bit guest support and domU FDT autogeneration
>   *  will be upstreamed.
>   */
> +
> +static inline void mark_dirty_bitmap(struct domain *d, paddr_t addr)
> +{
> +    paddr_t ram_base = (paddr_t) GUEST_RAM_BASE;
> +    int bit_index = PFN_DOWN(addr - ram_base);
> +    int page_index = bit_index >> (PAGE_SHIFT + 3);

+3? I guess that's because each bitmap page holds PAGE_SIZE * 8 bits,
but a comment (or a named constant for the byte-to-bit shift) would
make that clearer.

> +    int bit_index_residual = bit_index & ((1ul << (PAGE_SHIFT + 3)) - 1);
> +
> +    set_bit(bit_index_residual, d->arch.dirty.bitmap[page_index]);
> +}
> +
> +/* routine for dirty-page tracing
> + *
> + * On first write, it page faults, its entry is changed to read-write,
> + * and on retry the write succeeds.
> + *
> + * for locating p2m of the faulting entry, we use virtual-linear page table.
> + * returns zero if addr is not valid or dirty mode is not set
> + */
> +int handle_page_fault(struct domain *d, paddr_t addr)
> +{
> +
> +    lpae_t *vlp2m_pte = 0;
> +    paddr_t gma_start = 0;
> +    paddr_t gma_end = 0;
> +
> +    if ( !d->arch.dirty.mode ) return 0;
> +    get_gma_start_end(d, &gma_start, &gma_end);
> +
> +    /* Ensure that addr is inside guest's RAM */
> +    if ( addr < gma_start ||
> +         addr > gma_end ) return 0;
> +
> +    vlp2m_pte = get_vlpt_3lvl_pte(addr);
> +    if ( vlp2m_pte->p2m.valid && vlp2m_pte->p2m.write == 0 &&
> +         vlp2m_pte->p2m.avail == 0 /* reuse avail bit as read-only */ )
> +    {
> +        lpae_t pte = *vlp2m_pte;
> +        pte.p2m.write = 1;
> +        write_pte(vlp2m_pte, pte);

Do we not need the p2m lock around here somewhere?
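
e.g. something along these lines (just a sketch; exactly where the
critical section should start and end would want thought):

        spin_lock(&d->arch.p2m.lock);
        pte = *vlp2m_pte;
        pte.p2m.write = 1;
        write_pte(vlp2m_pte, pte);
        spin_unlock(&d->arch.p2m.lock);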

> +        flush_tlb_local();

If SMP was working this would need to be inner shareable to cope with
vcpus running on other pcpus.

> +        /* only necessary to lock between get-dirty bitmap and mark dirty
> +         * bitmap. If get-dirty bitmap happens immediately before this
> +         * lock, the corresponding dirty-page would be marked at the next
> +         * round of get-dirty bitmap */
> +        spin_lock(&d->arch.dirty.lock);
> +        mark_dirty_bitmap(d, addr);
> +        spin_unlock(&d->arch.dirty.lock);

Might an atomic set_bit suffice rather than the lock?

> +    }
> +
> +    return 1;
> +}
> +
>  void get_gma_start_end(struct domain *d, paddr_t *start, paddr_t *end)
>  {
>      if ( start )
> @@ -1440,6 +1493,54 @@ void cleanup_vlpt(struct domain *d)
>      unmap_domain_page_global(d->arch.dirty.second_lvl[0]);
>      unmap_domain_page_global(d->arch.dirty.second_lvl[1]);
>  }
> +
> +int prepare_bitmap(struct domain *d)
> +{
> +    paddr_t gma_start = 0;
> +    paddr_t gma_end = 0;
> +    int nr_bytes;
> +    int nr_pages;
> +    int i;
> +
> +    get_gma_start_end(d, &gma_start, &gma_end);
> +
> +    nr_bytes = (PFN_DOWN(gma_end - gma_start) + 7) / 8;
> +    nr_pages = (nr_bytes + PAGE_SIZE - 1) / PAGE_SIZE;
> +
> +    BUG_ON( nr_pages > MAX_DIRTY_BITMAP_PAGES );
> +
> +    for ( i = 0; i < nr_pages; ++i )
> +    {
> +        struct page_info *page;
> +        page = alloc_domheap_page(NULL, 0);
> +        if ( page == NULL )
> +            goto cleanup_on_failure;
> +
> +        d->arch.dirty.bitmap[i] = map_domain_page_global(__page_to_mfn(page));

I think you may as well just use the xenheap here.
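
i.e. (sketch):

        d->arch.dirty.bitmap[i] = alloc_xenheap_page();
        if ( d->arch.dirty.bitmap[i] == NULL )
            goto cleanup_on_failure;
        clear_page(d->arch.dirty.bitmap[i]);

which avoids burning global domheap mapping slots on these pages.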

> +/* Change types across all p2m entries in a domain */
> +void p2m_change_entry_type_global(struct domain *d, enum mg nt)
> +{
> +    struct p2m_domain *p2m = &d->arch.p2m;
> +    paddr_t ram_base;
> +    int i1, i2, i3;
> +    int first_index, second_index, third_index;
> +    lpae_t *first = __map_domain_page(p2m->first_level);
> +    lpae_t pte, *second = NULL, *third = NULL;
> +
> +    get_gma_start_end(d, &ram_base, NULL);
> +
> +    first_index = first_table_offset((uint64_t)ram_base);
> +    second_index = second_table_offset((uint64_t)ram_base);
> +    third_index = third_table_offset((uint64_t)ram_base);

Are those casts needed?

> +
> +    BUG_ON( !first && "Can't map first level p2m." );
> +
> +    spin_lock(&p2m->lock);
> +
> +    for ( i1 = first_index; i1 < LPAE_ENTRIES*2; ++i1 )

You avoid iterating over stuff below ram_base but you aren't worried
about ram end?

> +    {
> +        lpae_walk_t first_pte = first[i1].walk;
> +        if ( !first_pte.valid || !first_pte.table )
> +            goto out;
> +
> +        second = map_domain_page(first_pte.base);
> +        BUG_ON( !second && "Can't map second level p2m.");
> +        for ( i2 = second_index; i2 < LPAE_ENTRIES; ++i2 )
> +        {
> +            lpae_walk_t second_pte = second[i2].walk;
> +
> +            if ( !second_pte.valid || !second_pte.table )
> +                goto out;

goto out is a bit strong. Don't you want to move on to the next second
level page?
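
i.e.

            if ( !second_pte.valid || !second_pte.table )
                continue;

(and likewise skip, rather than abort on, invalid entries at the
other levels).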

> +
> +            third = map_domain_page(second_pte.base);
> +            BUG_ON( !third && "Can't map third level p2m.");
> +
> +            for ( i3 = third_index; i3 < LPAE_ENTRIES; ++i3 )
> +            {
> +                lpae_walk_t third_pte = third[i3].walk;
> +                if ( !third_pte.valid )
> +                    goto out;
> +
> +                pte = third[i3];
> +                if ( nt == mg_ro )
> +                {
> +                    if ( pte.p2m.write == 1 )
> +                    {
> +                        pte.p2m.write = 0;
> +                        pte.p2m.avail = 0;
> +                    }
> +                    else
> +                    {
> +                        /* reuse avail bit as an indicator
> +                         * of 'actual' read-only */
> +                        pte.p2m.avail = 1;
> +                    }

This inner if is equivalent to
        pte.p2m.avail = !pte.p2m.write;
        pte.p2m.write = 0;

> +                }
> +                else if ( nt == mg_rw )
> +                {
> +                    if ( pte.p2m.write == 0 && pte.p2m.avail == 0 )
> +                    {
> +                        pte.p2m.write = 1;
> +                    }

and this one is:
        pte.p2m.write = !pte.p2m.avail;

> +                }
> +                write_pte(&third[i3], pte);
> +            }
> +            unmap_domain_page(third);
> +
> +            third = NULL;
> +            third_index = 0;
> +        }
> +        unmap_domain_page(second);
> +
> +        second = NULL;
> +        second_index = 0;
> +        third_index = 0;
> +    }
> +
> +out:
> +    flush_tlb_all_local();
> +    if ( third ) unmap_domain_page(third);
> +    if ( second ) unmap_domain_page(second);
> +    if ( first ) unmap_domain_page(first);
> +
> +    spin_unlock(&p2m->lock);
> +}
> +
> +/* Read a domain's log-dirty bitmap and stats.
> + * If the operation is a CLEAN, clear the bitmap and stats. */
> +int log_dirty_op(struct domain *d, xen_domctl_shadow_op_t *sc)
> +{
> +    int peek = 1;
> +    int i;
> +    int bitmap_size;
> +    paddr_t gma_start, gma_end;
> +
> +    /* this hypercall is called from domain 0, and we don't know which guest's
> +     * vlpt is mapped in xen_second, so, to be sure, we restore vlpt here */
> +    restore_vlpt(d);
> +
> +    get_gma_start_end(d, &gma_start, &gma_end);
> +    bitmap_size = (gma_end - gma_start) / 8;
> +
> +    if ( guest_handle_is_null(sc->dirty_bitmap) )
> +    {
> +        peek = 0;

What do you do with this?

> +    }
> +    else
> +    {
> +        spin_lock(&d->arch.dirty.lock);
> +        for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
> +        {
> +            int j = 0;
> +            uint8_t *bitmap;
> +            copy_to_guest_offset(sc->dirty_bitmap, i * PAGE_SIZE,
> +                                 d->arch.dirty.bitmap[i],
> +                                 bitmap_size < PAGE_SIZE ? bitmap_size :
> +                                                           PAGE_SIZE);

The bitmap is always a multiple of PAGE_SIZE, isn't it?

> +            bitmap_size -= PAGE_SIZE;
> +
> +            /* set p2m page table read-only */
> +            bitmap = d->arch.dirty.bitmap[i];
> +            while ((j = find_next_bit((const long unsigned int *)bitmap,
> +                                      PAGE_SIZE*8, j)) < PAGE_SIZE*8)
> +            {
> +                lpae_t *vlpt;
> +                paddr_t addr = gma_start +
> +                               (i << (2*PAGE_SHIFT+3)) +
> +                               (j << PAGE_SHIFT);
> +                vlpt = get_vlpt_3lvl_pte(addr);
> +                vlpt->p2m.write = 0;

Needs to be a write_pte type construct I think.
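
i.e. (sketch):

                lpae_t pte = *vlpt;
                pte.p2m.write = 0;
                write_pte(vlpt, pte);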

> +                j++;
> +            }
> +        }
> +
> +        if ( sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN )
> +        {
> +            for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
> +            {
> +                clear_page(d->arch.dirty.bitmap[i]);
> +            }
> +        }
> +
> +        spin_unlock(&d->arch.dirty.lock);

Ah, I see why the lock in the handler is needed.

This holds the lock over quite a long period. Could it be made more
granular by copying and clearing each page in sequence, dropping the
lock between pages?
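
Untested sketch of what I mean (the read-only re-marking would need to
stay in the same critical section as the copy of each page):

        for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
        {
            spin_lock(&d->arch.dirty.lock);
            copy_to_guest_offset(sc->dirty_bitmap, i * PAGE_SIZE,
                                 d->arch.dirty.bitmap[i], PAGE_SIZE);
            /* ... walk this page's bits and re-mark the corresponding
             * p2m entries read-only, as above ... */
            if ( sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN )
                clear_page(d->arch.dirty.bitmap[i]);
            spin_unlock(&d->arch.dirty.lock);
        }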

> diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
> index 287dd7b..1a7ed11 100644
> --- a/xen/arch/arm/traps.c
> +++ b/xen/arch/arm/traps.c
> @@ -1321,6 +1321,8 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
>      const char *msg;
>      int rc, level = -1;
>      mmio_info_t info;
> +    int page_fault = ( (dabt.dfsc & FSC_MASK) ==
> +                          (FSC_FLT_PERM | FSC_3D_LEVEL) && dabt.write );

I think you can use FSC_TYPE_MASK and FSC_LL_MASK here, can't you?

I think this would be better off refactored into a
dabt_is_page_fault(dabt), used in the test below. That would allow you
to more easily comment on why these particular conditions are the ones
we care about etc.
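
e.g. (sketch, assuming FSC_TYPE_MASK masks the fault-type bits of
dfsc):

        /* Dirty-page tracing only induces write permission faults on
         * third-level (leaf) entries, so those are the only faults we
         * treat as log-dirty faults. */
        static inline int dabt_is_page_fault(struct hsr_dabt dabt)
        {
            return ( (dabt.dfsc & FSC_TYPE_MASK) == FSC_FLT_PERM &&
                     (dabt.dfsc & FSC_LL_MASK) == FSC_3D_LEVEL &&
                     dabt.write );
        }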

>      if ( !check_conditional_instr(regs, hsr) )
>      {
> @@ -1342,6 +1344,13 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
>      if ( rc == -EFAULT )
>          goto bad_data_abort;
>  
> +    /* domU page fault handling for guest live migration */
> +    /* dabt.valid can be 0 here */
> +    if ( page_fault && handle_page_fault(current->domain, info.gpa) )
> +    {
> +        /* Do not modify pc after page fault to repeat memory operation */
> +        return;
> +    }
>      /* XXX: Decode the instruction if ISS is not valid */
>      if ( !dabt.valid )
>          goto bad_data_abort;
> diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
> index 4f366f1..180d924 100644
> --- a/xen/include/asm-arm/domain.h
> +++ b/xen/include/asm-arm/domain.h
> @@ -114,9 +114,16 @@ struct arch_domain
>  
>      /* dirty-page tracing */
>      struct {
> +#define MAX_DIRTY_BITMAP_PAGES 64        /* support upto 8GB guest memory */
> +        spinlock_t lock;                 /* protect list: head, mvn_head */
> +        volatile int mode;               /* 1 if dirty pages tracing enabled */
> +        volatile unsigned int count;     /* dirty pages counter */

More unnecessary volatiles, I think.

>          volatile int second_lvl_start;   /* for context switch */
>          volatile int second_lvl_end;
>          lpae_t *second_lvl[2];           /* copy of guest p2m's first */
> +        /* dirty bitmap */
> +        uint8_t *bitmap[MAX_DIRTY_BITMAP_PAGES];
> +        int bitmap_pages;                /* number of dirty bitmap pages */
>      } dirty;
>  
>  }  __cacheline_aligned;
> diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
> index a74e135..1ce7a4b 100644
> --- a/xen/include/asm-arm/mm.h
> +++ b/xen/include/asm-arm/mm.h
> @@ -341,11 +341,18 @@ static inline void put_page_and_type(struct page_info *page)
>      put_page(page);
>  }
>  
> +enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
> +
> +/* routine for dirty-page tracing */
> +int handle_page_fault(struct domain *d, paddr_t addr);
>  void get_gma_start_end(struct domain *d, paddr_t *start, paddr_t *end);
>  int prepare_vlpt(struct domain *d);
>  void cleanup_vlpt(struct domain *d);
>  void restore_vlpt(struct domain *d);
>  
> +int prepare_bitmap(struct domain *d);
> +void cleanup_bitmap(struct domain *d);
> +
>  /* calculate the xen's virtual address for accessing the leaf PTE of
>   * a given address (GPA) */
>  static inline lpae_t * get_vlpt_3lvl_pte(paddr_t addr)
> diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
> index c660820..dba9a7b 100644
> --- a/xen/include/asm-arm/p2m.h
> +++ b/xen/include/asm-arm/p2m.h
> @@ -2,6 +2,7 @@
>  #define _XEN_P2M_H
>  
>  #include <xen/mm.h>
> +#include <public/domctl.h>
>  
>  struct domain;
>  
> @@ -110,6 +111,9 @@ static inline int get_page_and_type(struct page_info *page,
>      return rc;
>  }
>  
> +void p2m_change_entry_type_global(struct domain *d, enum mg nt);
> +long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc);
> +
>  #endif /* _XEN_P2M_H */
>  
>  /*
> diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
> index 5294421..fced6ad 100644
> --- a/xen/include/asm-arm/processor.h
> +++ b/xen/include/asm-arm/processor.h
> @@ -399,6 +399,8 @@ union hsr {
>  #define FSC_CPR        (0x3a) /* Coprocossor Abort */
>  
>  #define FSC_LL_MASK    (0x03<<0)
> +#define FSC_MASK       (0x3f) /* Fault status mask */
> +#define FSC_3D_LEVEL   (0x03) /* Third level fault*/
>  
>  /* Time counter hypervisor control register */
>  #define CNTHCTL_PA      (1u<<0)  /* Kernel/user access to physical counter */


