
[Xen-devel] [PATCH v3 09/10] xen/arm: Implement hypercall for dirty page tracing (shadow op)



Add hypercall (shadow op: enable/disable and clean/peek dirtied page bitmap).

To generate the dirty bitmap, we loop over the Xen page tables that map the
guest p2m (the VLPT). This way, we do not need to map/unmap domain pages for
the guest p2m.
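For illustration only (not part of the diff below): each leaf entry of the
VLPT corresponds to one guest page, so its index counted from the start of
the VLPT is the bit number in the dirty bitmap. A minimal sketch of that bit
manipulation, assuming Xen's PAGE_SHIFT and __test_and_set_bit(), would be:

    /* sketch: set bit 'bit_offset' in a bitmap that is split into
     * PAGE_SIZE chunks; each bitmap[i] page covers 32K guest pages */
    static inline void mark_dirty_bit(uint8_t *bitmap[], unsigned long bit_offset)
    {
        unsigned long idx = bit_offset >> (PAGE_SHIFT + 3);              /* bitmap page */
        unsigned long bit = bit_offset & ((1ul << (PAGE_SHIFT + 3)) - 1); /* bit in page */

        __test_and_set_bit(bit, bitmap[idx]);
    }

The real walk (including the i1_offset correction for a VLPT that does not
start on a xen_second boundary) is in get_dirty_bitmap() below.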

To unmap the guest p2m slotted into Xen's page table once live migration has
finished, we add add_mapped_vaddr() to record the write-faulting addresses;
the actual unmapping is done in destroy_all_mapped_vaddrs().
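A rough sketch of that flow (illustration only; the error handling shown here
is hypothetical, the real call sites are handle_page_fault() and
free_vlpt_for_p2m() in the diff below):

    /* at write-fault time, after the guest PT page has been slotted
     * into xen_third: remember the VA for the later teardown */
    if ( add_mapped_vaddr(d, va) )
        return -ENOMEM;                 /* hypothetical: patch ignores rc */

    /* at XEN_DOMCTL_SHADOW_OP_OFF, once migration is done:
     * destroy_xen_mappings() is invoked for every recorded VA */
    destroy_all_mapped_vaddrs(d);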

Signed-off-by: Jaeyong Yoo <jaeyong.yoo@xxxxxxxxxxx>
---
 xen/arch/arm/domain.c     |   7 ++
 xen/arch/arm/domctl.c     |  13 ++
 xen/arch/arm/mm.c         |  95 ++++++++++++++
 xen/arch/arm/p2m.c        | 307 ++++++++++++++++++++++++++++++++++++++++++++++
 xen/include/asm-arm/mm.h  |   1 +
 xen/include/asm-arm/p2m.h |   4 +
 6 files changed, 427 insertions(+)

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index e9cfc81..b629988 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -512,6 +512,13 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
     spin_lock_init(&d->arch.map_lock);
     d->arch.map_domain.nr_banks = 0;
 
+    /* init for dirty-page tracing */
+    d->arch.dirty.count = 0;
+    d->arch.dirty.gmfn_guest_start = 0;
+    d->arch.dirty.vlpt_start = NULL;
+    d->arch.dirty.vlpt_end = NULL;
+    d->arch.dirty.head = NULL;
+
     clear_page(d->shared_info);
     share_xen_page_with_guest(
         virt_to_page(d->shared_info), d, XENSHARE_writable);
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index 9cfb48a..87c5184 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -93,6 +93,19 @@ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
             xfree(c.data);
     }
     break;
+    case XEN_DOMCTL_shadow_op:
+    {
+        domain_pause(d);
+        ret = dirty_mode_op(d, &domctl->u.shadow_op);
+        domain_unpause(d);
+
+        if ( domctl->u.shadow_op.op == XEN_DOMCTL_SHADOW_OP_CLEAN ||
+             domctl->u.shadow_op.op == XEN_DOMCTL_SHADOW_OP_PEEK )
+        {
+            copyback = 1;
+        }
+    }
+    break;
 
     default:
         return -EINVAL;
diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index a24afe6..cd7bdff 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -1304,6 +1304,9 @@ int handle_page_fault(struct domain *d, paddr_t addr)
         pte.pt.table = 1; /* 4k mappings always have this bit set */
         write_pte(&xen_third[xen_third_table], pte);
         flush_xen_data_tlb_range_va(va, PAGE_SIZE);
+
+        /* record the VA so the mapping can be removed at the free stage */
+        add_mapped_vaddr(d, va);
     }
 
     /* at this point, xen third level pt has valid entry: means we can access
@@ -1322,6 +1325,98 @@ out:
     return rc;
 }
 
+int get_dirty_bitmap(struct domain *d, uint8_t *bitmap[], int peek, int clean)
+{
+    vaddr_t vlpt_start = (vaddr_t)d->arch.dirty.vlpt_start;
+    vaddr_t vlpt_end = (vaddr_t)d->arch.dirty.vlpt_end;
+    int xen_second_linear_start, xen_second_linear_end;
+    int xen_third_table_start, xen_third_table_end;
+    int i1, i2, i3;
+
+    xen_second_linear_start = second_linear_offset((unsigned long)vlpt_start);
+    xen_second_linear_end = second_linear_offset((unsigned long)vlpt_end) + 1;
+
+    for ( i1 = xen_second_linear_start; i1 < xen_second_linear_end; i1++ )
+    {
+        vaddr_t xen_second_start_va;
+        int i1_offset = 0;
+        lpae_t *xen_third;
+
+        /* if the xen_second entry is not valid, the corresponding
+         * region has not been dirtied, so we do nothing */
+        if ( !xen_second[i1].pt.valid )
+            continue;
+
+        xen_second_start_va = i1 << (LPAE_SHIFT + PAGE_SHIFT);
+
+        /* since the vlpt partially overlays xen_second,
+           we need to find the start index of the third table */
+        if ( vlpt_start > xen_second_start_va )
+        {
+            xen_third_table_start = third_table_offset(vlpt_start);
+            i1_offset = (vlpt_start - xen_second_start_va) / sizeof(lpae_t);
+        }
+        else
+            xen_third_table_start = 0;
+
+        if ( vlpt_end < xen_second_start_va +
+                        (1ul << (LPAE_SHIFT + PAGE_SHIFT)) )
+            xen_third_table_end = third_table_offset(vlpt_end) + 1;
+        else
+            xen_third_table_end = LPAE_ENTRIES;
+
+        xen_third = __va(pfn_to_paddr(xen_second[i1].pt.base));
+
+        for ( i2 = xen_third_table_start; i2 < xen_third_table_end; i2 ++ )
+        {
+            lpae_t *guest_third;
+            if ( !xen_third[i2].pt.valid )
+                continue;
+
+            guest_third = (lpae_t *)((i1 << (LPAE_SHIFT+PAGE_SHIFT))
+                                      + (i2 << PAGE_SHIFT));
+            for ( i3 = 0; i3 < LPAE_ENTRIES; i3++ )
+            {
+                lpae_t pte;
+                lpae_walk_t third_pte = guest_third[i3].walk;
+                int write = 0;
+                int bit_offset;
+                if ( !third_pte.valid )
+                    return -EINVAL;
+
+                pte = guest_third[i3];
+                if ( peek && pte.p2m.avail )
+                {
+                    int bitmap_index;
+                    int bitmap_offset;
+                    bit_offset = (i1 - xen_second_linear_start) *
+                                     LPAE_ENTRIES * LPAE_ENTRIES +
+                                 i2 * LPAE_ENTRIES +
+                                 i3 -
+                                 i1_offset;
+
+                    bitmap_index = bit_offset >> (PAGE_SHIFT + 3);
+                    bitmap_offset = bit_offset & ((1ul << (PAGE_SHIFT + 3)) -
+                                                  1);
+                    __test_and_set_bit(bitmap_offset, bitmap[bitmap_index]);
+                    write = 1;
+                }
+                if ( clean && pte.p2m.write )
+                {
+                    pte.p2m.write = 0;
+                    pte.p2m.avail = 0;
+                    write = 1;
+                }
+                if ( write )
+                    write_pte(&guest_third[i3], pte);
+            }
+        }
+    }
+
+    flush_tlb_all_local();
+    return 0;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 307c6d4..c62a383 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -5,6 +5,9 @@
 #include <xen/domain_page.h>
 #include <asm/flushtlb.h>
 #include <asm/gic.h>
+#include <asm/vlpt.h>
+#include <xen/guest_access.h>
+#include <xen/pfn.h>
 
 void dump_p2m_lookup(struct domain *d, paddr_t addr)
 {
@@ -345,6 +348,310 @@ unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn)
     return p >> PAGE_SHIFT;
 }
 
+static int alloc_vlpt_for_p2m(struct domain *d)
+{
+    unsigned long gmfn_start = 0, gmfn_end = 0, gmfns, pgts_3rd;
+    void *vlpt_start, *vlpt_end;
+    int nr_banks;
+
+    spin_lock(&d->arch.map_lock);
+    /* The guest memory map must be ordered by start addr */
+    nr_banks = d->arch.map_domain.nr_banks;
+    if ( nr_banks )
+    {
+        gmfn_start = d->arch.map_domain.bank[0].start >> PAGE_SHIFT;
+        gmfn_end = (d->arch.map_domain.bank[nr_banks - 1].start +
+            d->arch.map_domain.bank[nr_banks - 1].size) >> PAGE_SHIFT;
+    }
+    spin_unlock(&d->arch.map_lock);
+    gmfns = gmfn_end - gmfn_start;
+    pgts_3rd = (gmfns + LPAE_ENTRIES - 1) >> LPAE_SHIFT;
+
+    vlpt_start = vlpt_alloc(pgts_3rd, 1);
+
+    if ( !vlpt_start )
+    {
+        printk("Out of memory for allocating VLPT mapping\n");
+        goto out;
+    }
+
+    vlpt_end = vlpt_start + pgts_3rd*PAGE_SIZE;
+
+    d->arch.dirty.vlpt_start = vlpt_start;
+    d->arch.dirty.vlpt_end = vlpt_end;
+
+    d->arch.dirty.head = NULL;
+
+    return 0;
+out:
+    if ( vlpt_start ) vlpt_free(vlpt_start);
+    return -ENOMEM;
+}
+
+#define MAX_VA_PER_NODE ((PAGE_SIZE - sizeof(struct page_info *) - \
+                          sizeof(int)) / sizeof(unsigned long))
+
+/* an array-based linked list for storing virtual addresses
+ * where the third-table mapping should be destroyed after
+ * live migration */
+struct mapped_va_node
+{
+    struct page_info *next;
+    int items;
+    unsigned long vaddrs[MAX_VA_PER_NODE];
+};
+
+int add_mapped_vaddr(struct domain *d, unsigned long va)
+{
+    struct page_info *head_page = d->arch.dirty.head;
+    struct mapped_va_node *mvn = NULL;
+
+    if ( !head_page )
+    {
+        head_page = alloc_domheap_page(NULL, 0);
+        if ( !head_page )
+            return -ENOMEM;
+
+        mvn = __map_domain_page(head_page);
+        mvn->items = 0;
+        mvn->next = NULL;
+        d->arch.dirty.head = head_page;
+    }
+
+    if ( !mvn )
+        mvn = __map_domain_page(head_page);
+
+    if ( mvn->items == MAX_VA_PER_NODE )
+    {
+        struct page_info *page;
+        unmap_domain_page(mvn);
+
+        page = alloc_domheap_page(NULL, 0);
+        if ( !page )
+            return -ENOMEM;
+
+        mvn = __map_domain_page(page);
+        mvn->items = 0;
+        mvn->next = head_page;
+
+        d->arch.dirty.head = page;
+    }
+
+    mvn->vaddrs[mvn->items] = va;
+    mvn->items ++;
+
+    unmap_domain_page(mvn);
+    return 0;
+}
+
+static void destroy_all_mapped_vaddrs(struct domain *d)
+{
+    struct page_info *head_page = d->arch.dirty.head;
+    struct mapped_va_node *mvn = NULL;
+
+    while ( head_page )
+    {
+        int i;
+        mvn = __map_domain_page(head_page);
+        head_page = mvn->next;
+
+        for ( i = 0; i < mvn->items; ++i )
+            destroy_xen_mappings(mvn->vaddrs[i], mvn->vaddrs[i] + PAGE_SIZE);
+
+        unmap_domain_page(mvn);
+    }
+
+    d->arch.dirty.head = NULL;
+}
+
+static void free_vlpt_for_p2m(struct domain *d)
+{
+    destroy_all_mapped_vaddrs(d);
+
+    vlpt_free(d->arch.dirty.vlpt_start);
+    d->arch.dirty.vlpt_start = NULL;
+    d->arch.dirty.vlpt_end = NULL;
+    d->arch.dirty.head = NULL;
+}
+
+/* Change types across all p2m entries in a domain */
+static void p2m_change_entry_type_global(struct domain *d, enum mg nt)
+{
+    struct p2m_domain *p2m = &d->arch.p2m;
+    uint64_t ram_base = 0;
+    int i1, i2, i3;
+    int first_index, second_index, third_index;
+    lpae_t *first = __map_domain_page(p2m->first_level);
+    lpae_t pte, *second = NULL, *third = NULL;
+
+    spin_lock(&d->arch.map_lock);
+    /* Assume that the first bank's base is the guest's RAM base */
+    if ( d->arch.map_domain.nr_banks )
+        ram_base = d->arch.map_domain.bank[0].start;
+    spin_unlock(&d->arch.map_lock);
+    first_index = first_table_offset(ram_base);
+    second_index = second_table_offset(ram_base);
+    third_index = third_table_offset(ram_base);
+
+    BUG_ON( !ram_base && "RAM base is undefined" );
+    BUG_ON( !first && "Can't map first level p2m." );
+
+    spin_lock(&p2m->lock);
+
+    for ( i1 = first_index; i1 < LPAE_ENTRIES*2; ++i1 )
+    {
+        lpae_walk_t first_pte = first[i1].walk;
+        if ( !first_pte.valid || !first_pte.table )
+            goto out;
+
+        second = map_domain_page(first_pte.base);
+        BUG_ON( !second && "Can't map second level p2m.");
+        for ( i2 = second_index; i2 < LPAE_ENTRIES; ++i2 )
+        {
+            lpae_walk_t second_pte = second[i2].walk;
+            if ( !second_pte.valid || !second_pte.table )
+                goto out;
+
+            third = map_domain_page(second_pte.base);
+            BUG_ON( !third && "Can't map third level p2m.");
+
+            for ( i3 = third_index; i3 < LPAE_ENTRIES; ++i3 )
+            {
+                lpae_walk_t third_pte = third[i3].walk;
+                int write = 0;
+                if ( !third_pte.valid )
+                    goto out;
+
+                pte = third[i3];
+                if ( pte.p2m.write == 1 && nt == mg_ro )
+                {
+                    pte.p2m.write = 0;
+                    write = 1;
+                }
+                else if ( pte.p2m.write == 0 && nt == mg_rw )
+                {
+                    pte.p2m.write = 1;
+                    write = 1;
+                }
+                if ( write )
+                    write_pte(&third[i3], pte);
+            }
+            unmap_domain_page(third);
+
+            third = NULL;
+            third_index = 0;
+        }
+        unmap_domain_page(second);
+
+        second = NULL;
+        second_index = 0;
+        third_index = 0;
+    }
+
+out:
+    flush_tlb_all_local();
+    if ( third ) unmap_domain_page(third);
+    if ( second ) unmap_domain_page(second);
+    if ( first ) unmap_domain_page(first);
+
+    spin_unlock(&p2m->lock);
+}
+
+/* Read a domain's log-dirty bitmap and stats.
+ * If the operation is a CLEAN, clear the bitmap and stats. */
+int log_dirty_op(struct domain *d, xen_domctl_shadow_op_t *sc)
+{
+    unsigned long gmfn_start;
+    unsigned long gmfn_end;
+    unsigned long gmfns;
+    unsigned int bitmap_pages;
+    int rc = 0, clean = 0, peek = 1;
+    uint8_t *bitmap[256]; /* bitmap[256] covers 32GB ram */
+    int i;
+
+    BUG_ON( !d->arch.map_domain.nr_banks );
+
+    gmfn_start = d->arch.map_domain.bank[0].start >> PAGE_SHIFT;
+    gmfn_end = domain_get_maximum_gpfn(d);
+    gmfns = gmfn_end - gmfn_start;
+    bitmap_pages = PFN_UP((gmfns + 7) / 8);
+
+    if ( guest_handle_is_null(sc->dirty_bitmap) )
+    {
+        peek = 0;
+    }
+    else
+    {
+        /* prepare a mapping to the bitmap from guest param */
+        vaddr_t to = (vaddr_t)sc->dirty_bitmap.p; /* TODO: use macro */
+
+        BUG_ON( to & ~PAGE_MASK && "offset not aligned to PAGE SIZE");
+
+        for ( i = 0; i < bitmap_pages; ++i )
+        {
+            paddr_t g;
+            rc = gvirt_to_maddr(to, &g);
+            if ( rc )
+                return rc;
+            bitmap[i] = map_domain_page(g>>PAGE_SHIFT);
+            memset(bitmap[i], 0x00, PAGE_SIZE);
+            to += PAGE_SIZE;
+        }
+    }
+
+    clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN);
+
+    sc->stats.dirty_count = d->arch.dirty.count;
+
+    spin_lock(&d->arch.dirty.lock);
+
+    get_dirty_bitmap(d, bitmap, peek, clean);
+
+    if ( peek )
+    {
+        for ( i = 0; i < bitmap_pages; ++i )
+        {
+            unmap_domain_page(bitmap[i]);
+        }
+    }
+    spin_unlock(&d->arch.dirty.lock);
+
+    return 0;
+}
+
+long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc)
+{
+    long ret = 0;
+    switch (sc->op)
+    {
+        case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
+        case XEN_DOMCTL_SHADOW_OP_OFF:
+        {
+            enum mg nt = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? mg_rw : mg_ro;
+
+            d->arch.dirty.mode = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? 0 : 1;
+            p2m_change_entry_type_global(d, nt);
+
+            if ( sc->op == XEN_DOMCTL_SHADOW_OP_OFF )
+                free_vlpt_for_p2m(d);
+            else
+                ret = alloc_vlpt_for_p2m(d);
+        }
+        break;
+
+        case XEN_DOMCTL_SHADOW_OP_CLEAN:
+        case XEN_DOMCTL_SHADOW_OP_PEEK:
+        {
+            ret = log_dirty_op(d, sc);
+        }
+        break;
+
+        default:
+            return -ENOSYS;
+    }
+    return ret;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
index fd976e3..be67349 100644
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -332,6 +332,7 @@ enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
 
 /* routine for dirty-page tracing */
 int handle_page_fault(struct domain *d, paddr_t addr);
+int get_dirty_bitmap(struct domain *d, uint8_t *bitmap[], int peek, int clean);
 
 #endif /*  __ARCH_ARM_MM__ */
 /*
diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
index a00069b..fe33360 100644
--- a/xen/include/asm-arm/p2m.h
+++ b/xen/include/asm-arm/p2m.h
@@ -2,6 +2,7 @@
 #define _XEN_P2M_H
 
 #include <xen/mm.h>
+#include <public/domctl.h>
 
 struct domain;
 
@@ -107,6 +108,9 @@ static inline int get_page_and_type(struct page_info *page,
     return rc;
 }
 
+long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc);
+int add_mapped_vaddr(struct domain *d, unsigned long va);
+
 #endif /* _XEN_P2M_H */
 
 /*
-- 
1.8.1.2


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

