
Re: [Xen-devel] [PATCH v4 8/9] xen/arm: Implement hypercall for dirty page tracing



On Fri, 2013-10-04 at 13:44 +0900, Jaeyong Yoo wrote:
> Add hypercall (shadow op: enable/disable and clean/peek dirted page bitmap).

"dirtied"

> It consists of two parts: dirty page detecting and saving.
> For detecting, we setup the guest p2m's leaf PTE read-only and whenever
> the guest tries to write something, permission fault happens and traps into 
> xen.
> The permission-faulted GPA should be saved for the toolstack (when it wants 
> to see
> which pages are dirted). For this purpose, we temporarily save the GPAs into 
> linked

"dirtied" again

> list by using 'add_mapped_vaddr' function and when toolstack wants
> (log_dirty_op function) the GPAs are copied into bitmap and the linnked list 
> is flushed.

"linked"

>      spin_lock_init(&d->arch.map_lock);
>      d->arch.map_domain.nr_banks = 0;
>  
> +    /* init for dirty-page tracing */
> +    d->arch.dirty.count = 0;
> +    d->arch.dirty.mode = 0;
> +    d->arch.dirty.head = NULL;
> +    d->arch.dirty.mvn_head = NULL;
> +    spin_lock_init(&d->arch.dirty.lock);
> +
> +    d->arch.dirty.second_lvl_start = 0;
> +    d->arch.dirty.second_lvl_end = 0;
> +    d->arch.dirty.second_lvl = NULL;

I think some/all of these belong in the previous patch which added the
fields.

> +
>      clear_page(d->shared_info);
>      share_xen_page_with_guest(
>          virt_to_page(d->shared_info), d, XENSHARE_writable);
> diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
> index 78fb322..7edce12 100644
> --- a/xen/arch/arm/domctl.c
> +++ b/xen/arch/arm/domctl.c
> @@ -92,6 +92,19 @@ long arch_do_domctl(struct xen_domctl *domctl, struct 
> domain *d,
>              xfree(c.data);
>      }
>      break;
> +    case XEN_DOMCTL_shadow_op:
> +    {
> +        domain_pause(d);
> +        ret = dirty_mode_op(d, &domctl->u.shadow_op);
> +        domain_unpause(d);
> +
> +        if ( (&domctl->u.shadow_op)->op == XEN_DOMCTL_SHADOW_OP_CLEAN ||
> +             (&domctl->u.shadow_op)->op == XEN_DOMCTL_SHADOW_OP_PEEK )

(&domctl->u.shadow_op)->op is just a weird way to write
domctl->u.shadow_op.op, isn't it?
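
Something like this would read more naturally (just a sketch of the same
logic with the plain member access):

    case XEN_DOMCTL_shadow_op:
        domain_pause(d);
        ret = dirty_mode_op(d, &domctl->u.shadow_op);
        domain_unpause(d);

        if ( domctl->u.shadow_op.op == XEN_DOMCTL_SHADOW_OP_CLEAN ||
             domctl->u.shadow_op.op == XEN_DOMCTL_SHADOW_OP_PEEK )
            copyback = 1;
        break;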

Do you only want to copyback on success or is it always correct?

> +        {
> +            copyback = 1;
> +        }
> +    }
> +    break;


> @@ -1460,6 +1459,28 @@ void cleanup_vlpt(struct domain *d)
>      unmap_domain_page_global(d->arch.dirty.second_lvl);
>  }
>  
> +/* routine for dirty-page tracing
> + *
> + * On first write, it page faults, its entry is changed to read-write,
> + * and on retry the write succeeds.
> + *
> + * for locating p2m of the faulting entry, we use virtual-linear page table.
> + */
> +void handle_page_fault(struct domain *d, paddr_t addr)
> +{
> +    lpae_t *vlp2m_pte = get_vlpt_3lvl_pte(addr);
> +    if ( vlp2m_pte->p2m.valid && vlp2m_pte->p2m.write == 0 )

I think you need to distinguish p2m entries which are r/o due to log
dirty from those which are r/o because they are genuinely meant to be
read-only.

Maybe we don't have the latter right now? We will eventually need p2m
types I think.
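
Purely for illustration, with p2m types the check could be made explicit
(p2m.type and p2m_ram_logdirty are assumed names borrowed from the x86
side, nothing that exists on ARM today):

    if ( vlp2m_pte->p2m.valid && vlp2m_pte->p2m.write == 0 )
    {
        if ( vlp2m_pte->p2m.type == p2m_ram_logdirty ) /* hypothetical */
        {
            /* log-dirty write fault: relax to r/w and record the page */
        }
        else
        {
            /* genuinely read-only mapping: leave it alone and take the
             * normal fault path */
        }
    }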

> +    {
> +        lpae_t pte = *vlp2m_pte;
> +        pte.p2m.write = 1;
> +        write_pte(vlp2m_pte, pte);
> +        flush_tlb_local();
> +
> +        /* in order to remove mappings in cleanup stage */
> +        add_mapped_vaddr(d, addr);

No locks or atomic operations here? How are races with the tools
reading the dirty bitmap addressed? If it is by clever ordering of the
checks and pte writes, then I think a comment would be in order.
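
For example, taking the dirty lock around both the check and the PTE
update would make the interaction with the CLEAN path explicit (sketch
only, and possibly not the most scalable option):

    spin_lock(&d->arch.dirty.lock);
    if ( vlp2m_pte->p2m.valid && vlp2m_pte->p2m.write == 0 )
    {
        lpae_t pte = *vlp2m_pte;
        pte.p2m.write = 1;
        write_pte(vlp2m_pte, pte);
        flush_tlb_local();
        /* record the dirty page while still holding the lock, so the
         * CLEAN path cannot re-protect this entry in between */
    }
    spin_unlock(&d->arch.dirty.lock);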

> +    }
> +}
> +
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
> index 2d09fef..3b2b11a 100644
> --- a/xen/arch/arm/p2m.c
> +++ b/xen/arch/arm/p2m.c
> @@ -6,6 +6,8 @@
>  #include <xen/bitops.h>
>  #include <asm/flushtlb.h>
>  #include <asm/gic.h>
> +#include <xen/guest_access.h>
> +#include <xen/pfn.h>
>  
>  void dump_p2m_lookup(struct domain *d, paddr_t addr)
>  {
> @@ -408,6 +410,304 @@ unsigned long gmfn_to_mfn(struct domain *d, unsigned 
> long gpfn)
>      return p >> PAGE_SHIFT;
>  }
>  
> +#define MAX_VA_PER_NODE (PAGE_SIZE - 2 * sizeof(void *) - sizeof(int))\
> +                        / sizeof(unsigned long)
> +
> +/* An array-based linked list for storing virtual addresses (i.e., the 3rd
> + * level table mapping) that should be destroyed after live migration */
> +struct mapped_va_node
> +{
> +    struct page_info *next;
> +    struct mapped_va_node *mvn_next;
> +    int items;
> +    unsigned long vaddrs[MAX_VA_PER_NODE];
> +};
> +
> +int add_mapped_vaddr(struct domain *d, unsigned long va)
> +{

This function seems awfully expensive (it contains allocations etc.) for
a function called on every page fault during a migration.

Why are you remembering the full va of every fault when a single bit per
page would do?

I think you can allocate a bitmap when logdirty is enabled. At a bit per
page it shouldn't be too huge.

You might be able to encode this in the p2m which you walk when the
tools ask for the bitmap (i.e. p2m.type==writable && p2m.write==0 =>
dirty), but I think a bitmap would do the trick and be easier to
implement.
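
As a rough sketch of what I mean (d->arch.dirty.bitmap is a made-up
field, and I'm reusing your get_gma_start_end()):

    /* called when XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY is handled */
    static int logdirty_bitmap_alloc(struct domain *d)
    {
        vaddr_t gma_start, gma_end;
        unsigned long nr_pages;

        get_gma_start_end(d, &gma_start, &gma_end);
        nr_pages = (gma_end - gma_start) >> PAGE_SHIFT;

        d->arch.dirty.bitmap = xzalloc_array(unsigned long,
                                             BITS_TO_LONGS(nr_pages));
        return d->arch.dirty.bitmap ? 0 : -ENOMEM;
    }

The fault handler then only needs something like

    set_bit((addr - gma_start) >> PAGE_SHIFT, d->arch.dirty.bitmap);

with no allocation at all on the fault path.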

> +    struct page_info *page_head;
> +    struct mapped_va_node *mvn_head;
> +
> +    spin_lock(&d->arch.dirty.lock);
> +
> +    page_head = d->arch.dirty.head;
> +    mvn_head = d->arch.dirty.mvn_head;
> +    if ( !mvn_head )
> +    {
> +        page_head = alloc_domheap_page(NULL, 0);
> +        if ( !page_head )
> +        {
> +            spin_unlock(&d->arch.dirty.lock);
> +            return -ENOMEM;
> +        }
> +
> +        mvn_head = map_domain_page_global(__page_to_mfn(page_head));
> +        mvn_head->items = 0;
> +        mvn_head->next = NULL;
> +        mvn_head->mvn_next = NULL;
> +        d->arch.dirty.head = page_head;
> +        d->arch.dirty.mvn_head = mvn_head;
> +    }
> +
> +    if ( mvn_head->items == MAX_VA_PER_NODE )
> +    {
> +        struct page_info *page;
> +
> +        page = alloc_domheap_page(NULL, 0);
> +        if ( !page )
> +        {
> +            spin_unlock(&d->arch.dirty.lock);
> +            return -ENOMEM;
> +        }
> +
> +        mvn_head = map_domain_page_global(__page_to_mfn(page));
> +        mvn_head->items = 0;
> +        mvn_head->next = d->arch.dirty.head;
> +        mvn_head->mvn_next = d->arch.dirty.mvn_head;
> +
> +        d->arch.dirty.mvn_head = mvn_head;
> +        d->arch.dirty.head = page;
> +    }
> +
> +    mvn_head->vaddrs[mvn_head->items] = va;
> +    mvn_head->items ++;
> +
> +    spin_unlock(&d->arch.dirty.lock);
> +    return 0;
> +}
> +
> +/* get the dirty bitmap from the linked list stored by add_mapped_vaddr
> + * and also clear the mapped vaddrs linked list */
> +static void get_dirty_bitmap(struct domain *d, uint8_t *bitmap[])
> +{
> +    struct page_info *page_head;
> +    struct mapped_va_node *mvn_head;
> +    struct page_info *tmp_page;
> +    struct mapped_va_node *tmp_mvn;
> +    vaddr_t gma_start = 0;
> +    vaddr_t gma_end = 0;
> +
> +    /* this hypercall is called from domain 0, and we don't know which 
> guest's
> +     * vlpt is mapped in xen_second, so, to be sure, we restore vlpt here */
> +    restore_vlpt(d);

AFAICT you don't actually use the vlpt here, just the domain's list of
dirty addresses (which, as above, could become a bitmap).

> +
> +    spin_lock(&d->arch.dirty.lock);
> +
> +    page_head = d->arch.dirty.head;
> +    mvn_head = d->arch.dirty.mvn_head;
> +    get_gma_start_end(d, &gma_start, &gma_end);
> +
> +    while ( page_head )
> +    {
> +        int i;
> +
> +        for ( i = 0; i < mvn_head->items; ++i )
> +        {
> +            unsigned int bit_offset;
> +            int bitmap_index, bitmap_offset;
> +            lpae_t *vlp2m_pte;
> +
> +            bit_offset = third_linear_offset(mvn_head->vaddrs[i] - 
> gma_start);
> +            bitmap_index = bit_offset >> (PAGE_SHIFT + 3);
> +            bitmap_offset = bit_offset & ((1ul << (PAGE_SHIFT + 3)) - 1);
> +
> +            __set_bit(bitmap_offset, bitmap[bitmap_index]);
> +
> +            vlp2m_pte = get_vlpt_3lvl_pte(mvn_head->vaddrs[i]);
> +            vlp2m_pte->p2m.write = 0;
> +        }
> +
> +        tmp_page = page_head;
> +        page_head = mvn_head->next;
> +
> +        tmp_mvn = mvn_head;
> +        mvn_head = mvn_head->mvn_next;
> +
> +        unmap_domain_page_global(tmp_mvn);
> +        free_domheap_page(tmp_page);
> +    }
> +
> +    d->arch.dirty.head = NULL;
> +    d->arch.dirty.mvn_head = NULL;
> +
> +    spin_unlock(&d->arch.dirty.lock);
> +}
> +
> +
> +/* Change types across all p2m entries in a domain */
> +static void p2m_change_entry_type_global(struct domain *d, enum mg nt)
> +{

Stefano had some patches to generalise create_p2m_entries() into a p2m
walker infrastructure in his series to add XENMEM_exchange_and_pin. That
series turned out to be unnecessary, but it could be resurrected for use
here rather than coding up a new walker.

> +    struct p2m_domain *p2m = &d->arch.p2m;
> +    vaddr_t ram_base;
> +    int i1, i2, i3;
> +    int first_index, second_index, third_index;
> +    lpae_t *first = __map_domain_page(p2m->first_level);
> +    lpae_t pte, *second = NULL, *third = NULL;
> +
> +    get_gma_start_end(d, &ram_base, NULL);
> +
> +    first_index = first_table_offset((uint64_t)ram_base);
> +    second_index = second_table_offset((uint64_t)ram_base);
> +    third_index = third_table_offset((uint64_t)ram_base);
> +
> +    BUG_ON( !first && "Can't map first level p2m." );
> +
> +    spin_lock(&p2m->lock);
> +
> +    for ( i1 = first_index; i1 < LPAE_ENTRIES*2; ++i1 )
> +    {
> +        lpae_walk_t first_pte = first[i1].walk;
> +        if ( !first_pte.valid || !first_pte.table )
> +            goto out;
> +
> +        second = map_domain_page(first_pte.base);
> +        BUG_ON( !second && "Can't map second level p2m.");
> +        for ( i2 = second_index; i2 < LPAE_ENTRIES; ++i2 )
> +        {
> +            lpae_walk_t second_pte = second[i2].walk;
> +
> +            /* a nasty hack for vlpt due to the difference
> +             * of page table entry layout between p2m and pt */
> +            second[i2].pt.ro = 0;

What is the hack here?

The issue is that the p2m second level, which is a table entry in the
p2m, is installed into the Xen page tables, where it ends up as the
third level and is treated as a block entry.

That's OK, because for both PT and P2M entries bits 2..10 are ignored
for table entries. So I think rather than this hack we should simply
ensure that our non-leaf p2m entries have the correct bits in them in
the first place, so they can be used as page table entries without
fix-ups.
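
i.e. something along these lines when the non-leaf entry is first
created (helper name is hypothetical, only the bit layout matters):

    static lpae_t p2m_table_entry(unsigned long mfn)
    {
        lpae_t e = (lpae_t) {
            .p2m = {
                .valid = 1,
                .table = 1,
                .base  = mfn,
            }
        };
        /* bits 2..10 (which the PT view would read as ai/ro/sh etc.)
         * stay zero; they are ignored for table descriptors, so no
         * fix-up is needed when the entry is consumed via the VLPT */
        return e;
    }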

> +
> +            if ( !second_pte.valid || !second_pte.table )
> +                goto out;
> +
> +            third = map_domain_page(second_pte.base);
> +            BUG_ON( !third && "Can't map third level p2m.");
> +
> +            for ( i3 = third_index; i3 < LPAE_ENTRIES; ++i3 )
> +            {
> +                lpae_walk_t third_pte = third[i3].walk;
> +                int write = 0;
> +                if ( !third_pte.valid )
> +                    goto out;
> +
> +                pte = third[i3];
> +                if ( pte.p2m.write == 1 && nt == mg_ro )
> +                {
> +                    pte.p2m.write = 0;
> +                    write = 1;
> +                }
> +                else if ( pte.p2m.write == 0 && nt == mg_rw )
> +                {
> +                    pte.p2m.write = 1;
> +                    write = 1;
> +                }
> +                if ( write )
> +                    write_pte(&third[i3], pte);
> +            }
> +            unmap_domain_page(third);
> +
> +            third = NULL;
> +            third_index = 0;
> +        }
> +        unmap_domain_page(second);
> +
> +        second = NULL;
> +        second_index = 0;
> +        third_index = 0;
> +    }
> +
> +out:
> +    flush_tlb_all_local();
> +    if ( third ) unmap_domain_page(third);
> +    if ( second ) unmap_domain_page(second);
> +    if ( first ) unmap_domain_page(first);
> +
> +    spin_unlock(&p2m->lock);
> +}
> +
> +/* Read a domain's log-dirty bitmap and stats.
> + * If the operation is a CLEAN, clear the bitmap and stats. */
> +int log_dirty_op(struct domain *d, xen_domctl_shadow_op_t *sc)

Can this not be static?

> +{
> +    unsigned long gmfn_start;
> +    unsigned long gmfn_end;
> +    unsigned long gmfns;
> +    unsigned int bitmap_pages;
> +    int rc = 0, clean = 0, peek = 1;
> +    uint8_t *bitmap[256]; /* bitmap[256] covers 32GB ram */
> +    int i;
> +
> +    BUG_ON( !d->arch.map_domain.nr_banks );
> +
> +    gmfn_start = d->arch.map_domain.bank[0].start >> PAGE_SHIFT;
> +    gmfn_end = domain_get_maximum_gpfn(d);
> +    gmfns = gmfn_end - gmfn_start;
> +    bitmap_pages = PFN_UP((gmfns + 7) / 8);
> +
> +    if ( guest_handle_is_null(sc->dirty_bitmap) )
> +    {
> +        peek = 0;
> +    }
> +    else
> +    {
> +        /* mapping to the bitmap from guest param */
> +        vaddr_t to = (vaddr_t)sc->dirty_bitmap.p;

This is not allowed; please use the guest handle interfaces and
copy_(to/from)_(user/guest) instead of diving into the guest handle
struct and open-coding the access yourself.

This might mean creating a temporary bitmap in hypervisor space, but if
you maintain the bitmap across the whole dirty period as suggested then
you should be able to simply copy it out.

Actually, for consistency you might need an atomic copy-and-clear
mechanism (otherwise you can perhaps lose updates), in which case
you'll need a temporary buffer.
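
Roughly (again assuming the per-domain bitmap suggested earlier, with
"snapshot" being a temporary uint8_t buffer in the hypervisor):

    spin_lock(&d->arch.dirty.lock);
    memcpy(snapshot, d->arch.dirty.bitmap, bitmap_bytes);
    if ( clean )
        memset(d->arch.dirty.bitmap, 0, bitmap_bytes);
    spin_unlock(&d->arch.dirty.lock);

    if ( peek &&
         copy_to_guest_offset(sc->dirty_bitmap, 0, snapshot, bitmap_bytes) )
        rc = -EFAULT;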

> +
> +        BUG_ON( to & ~PAGE_MASK && "offset not aligned to PAGE SIZE");
> +
> +        for ( i = 0; i < bitmap_pages; ++i )
> +        {
> +            paddr_t g;
> +            rc = gvirt_to_maddr(to, &g);
> +            if ( rc )
> +                return rc;
> +            bitmap[i] = map_domain_page(g>>PAGE_SHIFT);
> +            memset(bitmap[i], 0x00, PAGE_SIZE);
> +            to += PAGE_SIZE;
> +        }
> +    }
> +
> +    clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
> +    sc->stats.dirty_count = d->arch.dirty.count;
> +
> +    get_dirty_bitmap(d, bitmap);
> +    if ( peek )
> +    {
> +        for ( i = 0; i < bitmap_pages; ++i )
> +        {
> +            unmap_domain_page(bitmap[i]);
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc)
> +{
> +    long ret = 0;
> +    switch (sc->op)
> +    {
> +        case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
> +        case XEN_DOMCTL_SHADOW_OP_OFF:
> +        {
> +            enum mg nt = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? mg_rw : mg_ro;
> +
> +            d->arch.dirty.mode = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? 0 : 1;
> +            p2m_change_entry_type_global(d, nt);
> +
> +            if ( sc->op == XEN_DOMCTL_SHADOW_OP_OFF )
> +                cleanup_vlpt(d);
> +            else
> +                prepare_vlpt(d);
> +        }
> +        break;
> +
> +        case XEN_DOMCTL_SHADOW_OP_CLEAN:
> +        case XEN_DOMCTL_SHADOW_OP_PEEK:
> +        {
> +            ret = log_dirty_op(d, sc);
> +        }
> +        break;
> +
> +        default:
> +            return -ENOSYS;
> +    }
> +    return ret;
> +}
> +
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
> index 4c0fc32..3b78ed2 100644
> --- a/xen/arch/arm/traps.c
> +++ b/xen/arch/arm/traps.c
> @@ -1313,6 +1313,8 @@ static void do_trap_data_abort_guest(struct 
> cpu_user_regs *regs,
>      const char *msg;
>      int rc, level = -1;
>      mmio_info_t info;
> +    int page_fault = ((dabt.dfsc & FSC_MASK) ==
> +                      (FSC_FLT_PERM + FSC_3D_LEVEL) && dabt.write);
>  
>      if ( !check_conditional_instr(regs, hsr) )
>      {
> @@ -1327,22 +1329,23 @@ static void do_trap_data_abort_guest(struct 
> cpu_user_regs *regs,
>      info.gva = READ_SYSREG64(FAR_EL2);
>  #endif
>  
> -    if (dabt.s1ptw)
> +    if ( dabt.s1ptw && !page_fault )
>          goto bad_data_abort;

I see this has been commented on elsewhere.

Ian.


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

