[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [RFC][PATCH 05/13] Kemari: Kemari sender



This patch implements a program to send an HVM guest image communicating with
the VMM part of Kemari.  After sending the whole guest memory first, it keeps
sending the delta of the guest status when specified events are detected.

Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@xxxxxxxxxxxxx>
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@xxxxxxxxxxxxx>
---
 tools/libxc/xc_dom_kemari_save.c | 1124 +++++++++++++++++++++++++++++++++++++++
 tools/xcutils/xc_kemari_save.c   |  518 +++++++++++++++++
 2 files changed, 1642 insertions(+)

diff -r 19201eebab16 tools/libxc/xc_dom_kemari_save.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_dom_kemari_save.c  Thu Mar 05 12:48:19 2009 +0900
@@ -0,0 +1,1124 @@
+/******************************************************************************
+ * xc_dom_kemari_save.c
+ *
+ * Save the state of a running Linux session.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * This source code is based on xc_domain_save.c.
+ * Copied BITS_PER_LONG, BITS_TO_LONGS, BITMAP_SIZE, BITMAP_SHIFT,
+ * RATE_IS_MAX, test_bit, clear_bit, set_bit, tv_delta, noncached_write,
+ * initialize_mbit_rate, and ratewrite from xc_domain_save.c
+ *
+ * Copyright (c) 2003, K A Fraser.
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
+#include "xc_private.h"
+#include "xc_dom.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#include <xen/hvm/params.h>
+#include "xc_e820.h"
+
+#ifdef  __MINIOS__
+/*
+ * Caution: atomicity of following alternative libc functions are broken.
+ */
/*
 * Minimal sendfile(2) replacement for stubdomain (Mini-OS) builds:
 * copy 'count' bytes from in_fd to out_fd through a small bounce buffer.
 * The 'offset' argument is not supported and must be NULL.
 * Returns the number of bytes copied, or -1 (with errno set) on error.
 */
static ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
{
    char buf[1024];
    ssize_t len;
    ssize_t wrote_len = 0;

    if (offset != NULL) {
        ERROR("Sorry sendfile for stubdomain should not have offset");
        errno = EIO;
        return -1;
    }

    while (count > 0) {
        len = read(in_fd, buf, (count < sizeof(buf)) ? count : sizeof(buf));
        if (len < 0)
            return -1;
        if (len == 0) {
            /* Fix: EOF before 'count' bytes were read previously looped
             * forever (count -= 0).  Treat a short source as an error. */
            errno = EIO;
            return -1;
        }
        if (write_exact(out_fd, buf, len))
            return -1;
        wrote_len += len;
        count -= len;
    }
    return wrote_len;
}
+
#define IOV_MAX 1024
struct iovec {
    void *iov_base; /* Base address. */
    size_t iov_len; /* Length. */
};
/*
 * Minimal writev(2) replacement for stubdomain (Mini-OS) builds.
 * Writes each iovec in order with write(2); stops early on a short
 * write.  Returns the total number of bytes written, or -1 (with errno
 * set) on error.  Not atomic, unlike a real writev().
 */
static ssize_t writev(int d, const struct iovec *iov, int iovcnt)
{
    int i;
    ssize_t len, wrote_len;

    if (iovcnt < 0 || iovcnt > IOV_MAX) {
        errno = EINVAL;
        return -1;
    }

    for (i = 0, wrote_len = 0; i < iovcnt; i++) {
        len = write(d, iov[i].iov_base, iov[i].iov_len);
        if (len < 0)
            return -1;

        wrote_len += len;
        if (wrote_len < 0) { /* integer overflow */
            errno = EINVAL;
            return -1;
        }

        /* Fix: compare in size_t; 'len' was a signed int matched against
         * the unsigned iov_len, a signed/unsigned comparison hazard. */
        if ((size_t)len != iov[i].iov_len)
            return wrote_len; /* short write: report what made it out */
    }

    return wrote_len;
}
+#else  /* !__MINIOS__ */
+#include <sys/sendfile.h>
+#include <sys/uio.h>
+#endif  /* __MINIOS__ */
+
+/* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */
+static unsigned long *qemu_bitmaps[2];
+static int qemu_active;
+static int qemu_non_active;
+
+/* number of pfns this guest has (i.e. number of entries in the P2M) */
+static unsigned long p2m_size;
+
+/* page frame numbers */
+static unsigned long *pfn_type = NULL;
+
+/* The new domain's shared-info frame number. */
+static unsigned long shared_info_frame;
+
+/*
+ * guest memory
+ */
+#define GUEST_MEM_ENTRY_SIZE    1024 /* up to 4MB at a time. */
+static unsigned char ** guest_memory = NULL;
+static unsigned long ** guest_memory_status = NULL;
+static unsigned long guest_memory_size = 0;
+
+static inline int map_guest_mem(int xc_handle, uint32_t domid,
+    unsigned long base)
+{
+    int j;
+    unsigned char * region_base;
+    unsigned long * pfn_base;
+
+    pfn_base = guest_memory_status[base];
+
+    memset(pfn_base, 0, GUEST_MEM_ENTRY_SIZE);
+    for (j = 0; j < GUEST_MEM_ENTRY_SIZE; j++) {
+        pfn_base[j] = base * GUEST_MEM_ENTRY_SIZE + j;
+    }
+    region_base = xc_map_foreign_batch(
+        xc_handle, domid, PROT_READ, pfn_base, GUEST_MEM_ENTRY_SIZE);
+    if ( region_base == NULL )
+    {
+        PERROR("map failed at guest memory frame 0x%lx - 0x%lx (%lu)",
+            base * GUEST_MEM_ENTRY_SIZE, (base + 1)* GUEST_MEM_ENTRY_SIZE - 1,
+            base);
+        return -1;
+    }
+
+    /* Look for and skip completely empty batches. */
+    for ( j = 0; j < GUEST_MEM_ENTRY_SIZE; j++ )
+        pfn_base[j] &= XEN_DOMCTL_PFINFO_LTAB_MASK;
+    for ( j = 0; j < GUEST_MEM_ENTRY_SIZE; j++ )
+        if ( pfn_base[j] != XEN_DOMCTL_PFINFO_XTAB )
+            break;
+    if ( j == GUEST_MEM_ENTRY_SIZE )
+    {
+        munmap(region_base, GUEST_MEM_ENTRY_SIZE*PAGE_SIZE);
+        guest_memory[base] = NULL;
+        return 1;
+    }
+
+    guest_memory[base] = region_base;
+
+    return 0;
+}
+
+static inline unsigned char * search_guest_mem(int xc_handle, uint32_t domid,
+    unsigned long mfn)
+{
+    unsigned long base = mfn / GUEST_MEM_ENTRY_SIZE;
+    unsigned long offset = mfn % GUEST_MEM_ENTRY_SIZE;
+
+    if (base >= guest_memory_size) {
+        ERROR("Error base(%lu) is greater than guest_memory_size(%lu)\n",
+            base, guest_memory_size);
+        return NULL;
+    }
+
+    if ( guest_memory_status[base][offset]  == XEN_DOMCTL_PFINFO_XTAB ) {
+        /* reload XTAB place */
+        munmap(guest_memory[base], GUEST_MEM_ENTRY_SIZE*PAGE_SIZE);
+        guest_memory[base] = NULL;
+        DPRINTF("guest_memory[%lu] (frame 0x%lx - 0x%lx) will be remapped\n",
+            base, base * GUEST_MEM_ENTRY_SIZE,
+            (base + 1) * GUEST_MEM_ENTRY_SIZE - 1);
+    }
+
+    if (guest_memory[base] == NULL)
+        if (map_guest_mem(xc_handle, domid, offset))
+            return NULL;
+
+    return guest_memory[base] + offset * PAGE_SIZE;
+    /* Since I don't care of XEN_DOMCTL_PFINFO_LTAB_MASK,
+        this program may cause some accidents. */
+}
+
+static inline int init_guest_mem(int xc_handle, uint32_t dom)
+{
+    int i;
+
+    guest_memory_size = p2m_size / GUEST_MEM_ENTRY_SIZE + 1;
+    DPRINTF("guest_memory_size: %lu\n", guest_memory_size);
+
+    /* mapped memory */
+    guest_memory = xg_memalign(PAGE_SIZE,
+        guest_memory_size * sizeof(guest_memory[0]));
+    if (guest_memory == NULL)
+    {
+        PERROR("failed to allocate guest_memory");
+        return -1;
+    }
+    if ( lock_pages(guest_memory, guest_memory_size * sizeof(guest_memory[0])))
+    {
+        ERROR("Unable to lock guest_memory array");
+        return -1;
+    }
+
+    /* memory status */
+    guest_memory_status   = xg_memalign(PAGE_SIZE,
+        guest_memory_size * sizeof(guest_memory_status[0]));
+    if ( guest_memory_status == NULL )
+    {
+        ERROR("failed to alloc memory for guest_memory_status");
+        errno = ENOMEM;
+        return -1;
+    }
+    if ( lock_pages(guest_memory_status,
+        guest_memory_size * sizeof(guest_memory_status[0])))
+    {
+        ERROR("Unable to lock guest_memory_status array");
+        return -1;
+    }
+
+    for (i = 0; i < guest_memory_size; i++) {
+        guest_memory_status[i] = xg_memalign(PAGE_SIZE,
+            GUEST_MEM_ENTRY_SIZE * sizeof(guest_memory_status[0][0]));
+        if (guest_memory_status[i] == NULL) {
+            ERROR("failed to alloc memory for guest_memory_status[%d]", i);
+            errno = ENOMEM;
+            return -1;
+        }
+        if ( lock_pages(guest_memory_status,
+            guest_memory_size * sizeof(guest_memory_status[0][0])))
+        {
+            ERROR("Unable to lock guest_memory_status[%d]", i);
+            return -1;
+        }
+    }
+
+    for (i = 0; i < guest_memory_size; i++)
+        if (map_guest_mem(xc_handle, dom, i) < 0)
+            return -1;
+
+    return 0;
+}
+
/*
 * Write the entire iovec array to fd in a single writev() call.
 * Returns 0 if every byte was written, -1 on error or short write.
 */
static int writev_exact(int fd, const struct iovec *iov, size_t count)
{
    size_t i;
    size_t sum = 0;
    ssize_t written;

    for (i = 0; i < count; i++)
        sum += iov[i].iov_len;

    /* Fix: writev() returns ssize_t; the old code compared it against a
     * size_t via implicit conversion.  Make the -1 and short-write cases
     * explicit. */
    written = writev(fd, iov, count);
    if (written < 0 || (size_t)written != sum)
        return -1;
    return 0;
}
+
+/* grep fodder: machine_to_phys */
+
+
+/*
+** During (live) save/migrate, we maintain a number of bitmaps to track
+** which pages we have to send, to fixup, and to skip.
+*/
+
/* Bitmap helpers (one bit per pfn), copied from xc_domain_save.c. */
#define BITS_PER_LONG (sizeof(unsigned long) * 8)
#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
#define BITMAP_SIZE   (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))

#define BITMAP_ENTRY(_nr,_bmap) \
   ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]

#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)

/* Return the value (0 or 1) of bit 'nr' in the bitmap at 'addr'. */
static inline int test_bit (int nr, volatile void * addr)
{
    unsigned long word = BITMAP_ENTRY(nr, addr);
    return (int)((word >> BITMAP_SHIFT(nr)) & 1UL);
}

/* Clear bit 'nr' in the bitmap at 'addr'. */
static inline void clear_bit (int nr, volatile void * addr)
{
    unsigned long mask = 1UL << BITMAP_SHIFT(nr);
    BITMAP_ENTRY(nr, addr) &= ~mask;
}

/* Set bit 'nr' in the bitmap at 'addr'. */
static inline void set_bit ( int nr, volatile void * addr)
{
    unsigned long mask = 1UL << BITMAP_SHIFT(nr);
    BITMAP_ENTRY(nr, addr) |= mask;
}
+
/* Microseconds elapsed between 'old' and 'new'. */
static uint64_t tv_delta(struct timeval *new, struct timeval *old)
{
    uint64_t secs  = new->tv_sec  - old->tv_sec;
    uint64_t usecs = new->tv_usec - old->tv_usec;
    return secs * 1000000 + usecs;
}
+
/*
 * Write 'len' bytes from 'buffer' to 'fd', periodically asking the OS
 * to drop the file's page cache so a long save does not evict useful
 * dom0 cache.  Returns 'len' on success, -1 on failure.
 */
static int noncached_write(int fd, void *buffer, int len)
{
    /* bytes written since the cache was last discarded (per process) */
    static int write_count = 0;
    int rc = (write_exact(fd, buffer, len) == 0) ? len : -1;

    write_count += len;
    if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
    {
        /* Time to discard cache - dont care if this fails */
        discard_file_cache(fd, 0 /* no flush */);
        write_count = 0;
    }

    return rc;
}
+
+#ifdef ADAPTIVE_SAVE
+
+/*
+** We control the rate at which we transmit (or save) to minimize impact
+** on running domains (including the target if we're doing live migrate).
+*/
+
+#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
+#define START_MBIT_RATE  100      /* initial transmit rate for migrate */
+
+/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
+#define RATE_TO_BTU      781250
+
+/* Amount in bytes we allow ourselves to send in a burst */
+#define BURST_BUDGET (100*1024)
+
+/* We keep track of the current and previous transmission rate */
+static int mbit_rate, ombit_rate = 0;
+
+/* Have we reached the maximum transmission rate? */
+#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
+
/* Reset the rate limiter to the initial transmit rate. */
static inline void initialize_mbit_rate()
{
    mbit_rate = START_MBIT_RATE;
}

/*
 * Rate-limited write of 'n' bytes from 'buf' to 'io_fd'.
 *
 * Token-bucket style: 'budget' bytes may be sent per 'burst_time_us'
 * slot; when the budget is exhausted we credit slots for elapsed time
 * and nanosleep() out the remainder of the current slot.  The rate may
 * be raised between calls by print_stats() (see ADAPTIVE_SAVE there).
 * Returns noncached_write()'s result: 'n' on success, -1 on failure.
 */
static int ratewrite(int io_fd, void *buf, int n)
{
    static int budget = 0;                  /* bytes left in current burst */
    static int burst_time_us = -1;          /* slot length for one budget */
    static struct timeval last_put = { 0 }; /* start of the current slot */
    struct timeval now;
    struct timespec delay;
    long long delta;

    if ( START_MBIT_RATE == 0 )
        return noncached_write(io_fd, buf, n);

    budget -= n;
    if ( budget < 0 )
    {
        if ( mbit_rate != ombit_rate )
        {
            /* Rate changed since last time: recompute the slot length. */
            burst_time_us = RATE_TO_BTU / mbit_rate;
            ombit_rate = mbit_rate;
            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
                    mbit_rate, BURST_BUDGET, burst_time_us);
        }
        if ( last_put.tv_sec == 0 )
        {
            /* First rate-limited write: start the clock now. */
            budget += BURST_BUDGET;
            gettimeofday(&last_put, NULL);
        }
        else
        {
            while ( budget < 0 )
            {
                gettimeofday(&now, NULL);
                delta = tv_delta(&now, &last_put);
                while ( delta > burst_time_us )
                {
                    /* Credit one budget slot per elapsed burst_time_us. */
                    budget += BURST_BUDGET;
                    last_put.tv_usec += burst_time_us;
                    if ( last_put.tv_usec > 1000000 )
                    {
                        last_put.tv_usec -= 1000000;
                        last_put.tv_sec++;
                    }
                    delta -= burst_time_us;
                }
                if ( budget > 0 )
                    break;
                /* Still over budget: sleep out the rest of the slot
                 * (retrying nanosleep if interrupted by a signal). */
                delay.tv_sec = 0;
                delay.tv_nsec = 1000 * (burst_time_us - delta);
                while ( delay.tv_nsec > 0 )
                    if ( nanosleep(&delay, &delay) == 0 )
                        break;
            }
        }
    }
    return noncached_write(io_fd, buf, n);
}
+
+#else /* ! ADAPTIVE SAVE */
+
+#define RATE_IS_MAX() (0)
+#define ratewrite(_io_fd, _buf, _n) noncached_write((_io_fd), (_buf), (_n))
+#define initialize_mbit_rate()
+
+#endif
+
/*
 * Print (and, with ADAPTIVE_SAVE, react to) save statistics: wall-clock
 * delta since the previous call, dom0 and target CPU usage, send rate
 * and dirty rate.  Static locals carry state between calls; the first
 * call (with print == 0) just primes them.  Always returns 0.
 */
static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
                       xc_shadow_op_stats_t *stats, int print)
{
    static struct timeval wall_last;
    static long long      d0_cpu_last;
    static long long      d1_cpu_last;

    struct timeval        wall_now;
    long long             wall_delta;
    long long             d0_cpu_now, d0_cpu_delta;
    long long             d1_cpu_now, d1_cpu_delta;

    gettimeofday(&wall_now, NULL);

    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000;
    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000;

    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
        DPRINTF("ARRHHH!!\n");

    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
    if ( wall_delta == 0 )
        wall_delta = 1;   /* avoid dividing by zero below */

    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;

    if ( print )
        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
                "dirtied %dMb/s %" PRId32 " pages\n",
                wall_delta,
                (int)((d0_cpu_delta*100)/wall_delta),
                (int)((d1_cpu_delta*100)/wall_delta),
                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
                stats->dirty_count);

#ifdef ADAPTIVE_SAVE
    /* Raise the transmit rate to track the dirty rate, plus headroom. */
    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
    {
        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
            + 50;
        if ( mbit_rate > MAX_MBIT_RATE )
            mbit_rate = MAX_MBIT_RATE;
    }
#endif

    d0_cpu_last = d0_cpu_now;
    d1_cpu_last = d1_cpu_now;
    wall_last   = wall_now;

    return 0;
}
+
+
/*
 * Send the EPT identity-map page-table location as a "-3" chunk on
 * io_fd.  If HVM_PARAM_IDENT_PT is 0 (unset, or xc_get_hvm_param()
 * failed and left the zero-initialized value) nothing is written.
 * Returns 0 on success or skip, -1 on write failure.
 */
static int send_ident_pt(int xc_handle, int io_fd, uint32_t dom)
{
    struct {
        int minusthree;      /* chunk tag understood by the receiver */
        uint32_t pad;
        uint64_t ident_pt;
    } chunk = { -3, 0 };     /* remaining members zero-initialized */

    /* NOTE(review): return value ignored; on failure chunk.ident_pt
     * stays 0 and the chunk is simply not sent. */
    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
                     &chunk.ident_pt);

    if ( (chunk.ident_pt != 0) &&
         write_exact(io_fd, &chunk, sizeof(chunk)) )
    {
        PERROR("Error when writing the ident_pt for EPT guest");
        return -1;
    }

    return 0;
}
+
/*
 * Send the qemu-dm device-model state as a "-4" chunk: the tag and
 * image size, followed by the raw contents of /dev/shm/qemu-save.<dom>.
 * Returns 0 on success, -1 on failure.
 */
static int send_qemu_image(int xc_handle, int io_fd, uint32_t dom)
{
    char path[128];
    struct stat st;
    struct {
        int minusfour;       /* chunk tag understood by the receiver */
        uint32_t image_size;
    } chunk = { -4, 0 };
    int qemu_fd = -1;
    int rc = -1;

    snprintf(path, sizeof(path), "/dev/shm/qemu-save.%d", dom);
    if ((qemu_fd = open(path, O_RDONLY)) == -1)
    {
        PERROR("Error when opening qemu image %s", path);
        goto out;
    }

    if (fstat(qemu_fd, &st) == -1)
    {
        PERROR("Error fstat qemu file %s", path);
        goto out;
    }
    chunk.image_size = st.st_size;

    if ( write_exact(io_fd, &chunk, sizeof(chunk)) )
    {
        PERROR("Error when writing header for qemu image");
        goto out;
    }

    if ( sendfile(io_fd, qemu_fd, NULL, chunk.image_size) !=
        chunk.image_size)
    {
        PERROR("Error when writing qemu image");
        goto out;
    }

    rc = 0;
out:
    /* Fix: qemu_fd was leaked on every error path after open(). */
    if (qemu_fd != -1)
        close(qemu_fd);
    return rc;
}
+
/*
 * Fetch the domain's HVM context from Xen into the context buffer that
 * lives inside the shared kemari ring, then send it on io_fd as a
 * uint32_t size followed by the raw blob.
 * Returns 0 on success, -1 on failure.
 */
static int send_hvm_context(int xc_handle, int io_fd,
                            struct kemari_ring *ring, uint32_t dom)
{
    uint32_t buf_size = ring->hvm_ctxt.buf_size;
    uint32_t rec_size = ring->hvm_ctxt.rec_size;
    /* context buffer is carved out of the ring's shared memory */
    uint8_t *hvm_buf = (uint8_t *)ring + ring->hvm_ctxt.buf_offset;
    int rc = -1;

    /* Get HVM context from Xen and save it too */
    /* NOTE(review): rec_size is uint32_t, so '== -1' relies on -1
     * converting to UINT32_MAX — confirm xc_domain_hvm_getcontext()
     * signals failure that way. */
    if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf,
                                              buf_size)) == -1 )
    {
        ERROR("HVM:Could not get hvm buffer");
        goto out;
    }

    if ( write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
    {
        PERROR("error write hvm buffer size");
        goto out;
    }

    if ( write_exact(io_fd, hvm_buf, rec_size) )
    {
        PERROR("write HVM info failed!\n");
        goto out;
    }
    rc = 0;

out:
    return rc;
}
+
+int xc_kemari_save(int xc_handle, int io_fd, uint32_t dom,
+                   void *kemari_ring, uint32_t flags,
+                   int hvm, void *(*init_qemu_maps)(int, unsigned))
+{
+    int rc = 1, i, j, iter = 0;
+    int debug = (flags & XCFLAGS_DEBUG);
+    int sent_last_iter, skip_this_iter;
+    xc_dominfo_t info;
+    struct kemari_ring *ring = (struct kemari_ring *)kemari_ring;
+
+    /* base of the region in which domain memory is mapped */
+    unsigned char *region_base = NULL;
+
+    /* bitmap of pages:
+       - that should be sent this iteration (unless later marked as skip);
+       - to skip this iteration because already dirty;
+       - to fixup by sending at the end if not already resent; */
+    unsigned long *to_send = NULL, *to_fix = NULL;
+
+    xc_shadow_op_stats_t stats;
+
+    unsigned long needed_to_fix = 0;
+    unsigned long total_sent    = 0;
+
+    /* HVM: magic frames for ioreqs and xenstore comms. */
+    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
+
+    /* callback irq */
+    uint64_t callback_irq = 0;
+
+    if ( !hvm )
+    {
+        ERROR("HVM domain is required for the kemari migration.");
+        return 1;
+    }
+
+    initialize_mbit_rate();
+
+    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
+    {
+        ERROR("Could not get domain info");
+        return 1;
+    }
+
+    shared_info_frame = info.shared_info_frame;
+    DPRINTF("xc_kemari_save: shared_info_frame: %lu\n", shared_info_frame);
+
+    /* Get the size of the P2M table */
+    p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom) + 1;
+    DPRINTF("xc_kemari_save: p2m_size: %lu\n", p2m_size);
+
+    /* Domain is still running at this point */
+    {
+        /* Get qemu-dm logging dirty pages too */
+        void *seg = init_qemu_maps(dom, BITMAP_SIZE);
+        qemu_bitmaps[0] = seg;
+        qemu_bitmaps[1] = seg + BITMAP_SIZE;
+        qemu_active = 0;
+        qemu_non_active = 1;
+    }
+
+    /* pretend we sent all the pages last iteration */
+    sent_last_iter = p2m_size;
+
+    /* Setup to_send / to_fix bitmaps */
+    to_send = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT));
+    to_fix  = calloc(1, BITMAP_SIZE);
+
+    if ( !to_send || !to_fix )
+    {
+        ERROR("Couldn't allocate to_send array");
+        goto out;
+    }
+
+    memset(to_send, 0xff, BITMAP_SIZE);
+
+    if ( lock_pages(to_send, BITMAP_SIZE) )
+    {
+        ERROR("Unable to lock to_send");
+        return 1;
+    }
+
+    pfn_type   = xg_memalign(PAGE_SIZE, ROUNDUP(
+                              MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
+    if ( pfn_type == NULL )
+    {
+        ERROR("failed to alloc memory for pfn_type arrays");
+        errno = ENOMEM;
+        goto out;
+    }
+    memset(pfn_type, 0,
+           ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
+
+    if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
+    {
+        ERROR("Unable to lock pfn_type array");
+        goto out;
+    }
+
+    /* Start writing out the saved-domain record. */
+    if ( write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
+    {
+        PERROR("write: p2m_size");
+        goto out;
+    }
+
+    /* send shared_info_frame */
+    if ( write_exact(io_fd, &shared_info_frame, sizeof(unsigned long)) )
+    {
+        PERROR("write: shared_info_frame");
+        goto out;
+    }
+
+    /* Save magic-page locations. */
+    memset(magic_pfns, 0, sizeof(magic_pfns));
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
+                     &magic_pfns[0]);
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
+                     &magic_pfns[1]);
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
+                     &magic_pfns[2]);
+    DPRINTF("kemari_restore: magic_pfns 0: %lld, 1: %lld, 2: %lld\n",
+        magic_pfns[0], magic_pfns[1], magic_pfns[2]);
+    if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
+    {
+        PERROR("Error when writing to state file (7)");
+        goto out;
+    }
+
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_CALLBACK_IRQ,
+                     &callback_irq);
+    DPRINTF("kemari_restore: callback irq %llx", callback_irq);
+    if ( write_exact(io_fd, &callback_irq, sizeof(callback_irq)) )
+    {
+        PERROR("Error when writing to state file (8)");
+        goto out;
+    }
+
+    print_stats(xc_handle, dom, 0, &stats, 0);
+
+    /* Now write out each data page, canonicalising page tables as we go... */
+    {
+        unsigned int prev_pc, sent_this_iter, N, batch, run;
+
+        iter++;
+        sent_this_iter = 0;
+        skip_this_iter = 0;
+        prev_pc = 0;
+        N = 0;
+
+        DPRINTF("Saving memory pages: iter %d   0%%", iter);
+
+        while ( N < p2m_size )
+        {
+            unsigned int this_pc = (N * 100) / p2m_size;
+
+            if ( (this_pc - prev_pc) >= 5 )
+            {
+                DPRINTF("\b\b\b\b%3d%%", this_pc);
+                prev_pc = this_pc;
+            }
+
+            /* load pfn_type[] with the mfn of all the pages we're doing in
+               this batch. */
+            for  ( batch = 0;
+                   (batch < MAX_BATCH_SIZE) && (N < p2m_size);
+                   N++ )
+            {
+                int n = N;
+
+                if ( debug )
+                {
+                    DPRINTF("%d pfn= %08lx mfn= %08lx %d",
+                            iter, (unsigned long)n,
+                            (long unsigned int)0,
+                            test_bit(n, to_send));
+                    DPRINTF("\n");
+                }
+
+                if ( !( (test_bit(n, to_send)) || (test_bit(n, to_fix))) )
+                    continue;
+
+                /* Skip PFNs that aren't really there */
+                if (((n >= 0xa0 && n < 0xc0) /* VGA hole */
+                             || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT)
+                                 && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ 
) {
+                    if (n >= shared_info_frame && n <= shared_info_frame + 32) 
{
+                        /* DPRINTF("shared_info_frame or grant: %d\n", n); */
+                    } else {
+                        continue;
+                    }
+                }
+
+                /*
+                ** we get here if:
+                **  1. page is marked to_send & hasn't already been re-dirtied
+                **  2. add in pages that still need fixup (net bufs)
+                */
+
+                /* Hypercall interfaces operate in PFNs for HVM guests
+                * and MFNs for PV guests */
+                pfn_type[batch] = n;
+
+                if ( !is_mapped(pfn_type[batch]) )
+                {
+                    /*
+                    ** not currently in psuedo-physical map -- set bit
+                    ** in to_fix since we must send this page in last_iter
+                    ** unless its sent sooner anyhow, or it never enters
+                    ** pseudo-physical map (e.g. for ballooned down doms)
+                    */
+                    set_bit(n, to_fix);
+                    continue;
+                }
+
+                if ( test_bit(n, to_fix) &&
+                     !test_bit(n, to_send) )
+                {
+                    needed_to_fix++;
+                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
+                            iter, n, pfn_type[batch]);
+                }
+
+                clear_bit(n, to_fix);
+
+                batch++;
+            }
+
+            if ( batch == 0 )
+                goto skip; /* vanishingly unlikely... */
+
+            region_base = xc_map_foreign_batch(
+                xc_handle, dom, PROT_READ, pfn_type, batch);
+            if ( region_base == NULL )
+            {
+                ERROR("map batch failed");
+                goto out;
+            }
+
+            {
+                /* Look for and skip completely empty batches. */
+                for ( j = 0; j < batch; j++ )
+                    if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) !=
+                         XEN_DOMCTL_PFINFO_XTAB )
+                        break;
+                if ( j == batch )
+                {
+                    munmap(region_base, batch*PAGE_SIZE);
+                    continue; /* bail on this batch: no valid pages */
+                }
+            }
+
+            if ( write_exact(io_fd, &batch, sizeof(unsigned int)) )
+            {
+                PERROR("Error when writing to state file (2)");
+                goto out;
+            }
+
+            if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
+            {
+                PERROR("Error when writing to state file (3)");
+                goto out;
+            }
+
+            /* entering this loop, pfn_type is now in pfns (Not mfns) */
+            run = 0;
+            for ( j = 0; j < batch; j++ )
+            {
+                unsigned long pfn, pagetype;
+
+                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( pagetype != 0 )
+                {
+                    /* If the page is not a normal data page, write out any
+                       run of pages we may have previously acumulated */
+                    if ( run )
+                    {
+                        if ( ratewrite(io_fd,
+                                       (char*)region_base+(PAGE_SIZE*(j-run)),
+                                       PAGE_SIZE*run) != PAGE_SIZE*run )
+                        {
+                            ERROR("Error when writing to state file (4a)"
+                                  " (errno %d)", errno);
+                            goto out;
+                        }
+                        run = 0;
+                    }
+                }
+
+                /* skip pages that aren't present */
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+                    continue;
+
+                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
+                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+                {
+                    DPRINTF("canonicalize_pagetable pagetype = %lx pfn = 
%lu\n", pagetype, pfn);
+                }
+                else
+                {
+                    /* We have a normal page: accumulate it for writing. */
+                    run++;
+                }
+            } /* end of the write out for this batch */
+
+            if ( run )
+            {
+                /* write out the last accumulated run of pages */
+                if ( ratewrite(io_fd,
+                               (char*)region_base+(PAGE_SIZE*(j-run)),
+                               PAGE_SIZE*run) != PAGE_SIZE*run )
+                {
+                    ERROR("Error when writing to state file (4c)"
+                          " (errno %d)", errno);
+                    goto out;
+                }
+            }
+
+            sent_this_iter += batch;
+
+            munmap(region_base, batch*PAGE_SIZE);
+
+        } /* end of this while loop for this iteration */
+
+      skip:
+
+        total_sent += sent_this_iter;
+
+        DPRINTF("\r %d: sent %d, skipped %d, ",
+                iter, sent_this_iter, skip_this_iter );
+
+        {
+            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
+
+            DPRINTF("Total pages sent= %ld (%.2fx)\n",
+                    total_sent, ((float)total_sent)/p2m_size );
+            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
+        }
+    } /* end of infinite for loop */
+
+    DPRINTF("All memory is saved\n");
+
+    if (send_ident_pt(xc_handle, io_fd, dom) < 0)
+        goto out;
+
+    /* Zero terminate */
+    i = 0;
+    if ( write_exact(io_fd, &i, sizeof(int)) )
+    {
+        PERROR("Error when writing to state file (6')");
+        goto out;
+    }
+
+    if (send_hvm_context(xc_handle, io_fd, ring, dom) < 0)
+        goto out;
+
+    if (!debug)
+    {
+        int rcv_status;
+        if ( read_exact(io_fd, &rcv_status, sizeof(int))) {
+            ERROR("Error when reading receiver status");
+            goto out;
+        }
+        DPRINTF("status received: %d\n", rcv_status);
+    }
+
+    if (init_guest_mem(xc_handle, dom) < 0)
+        goto out;
+
+    /* HVM guests are done now */
+    rc = 0;
+
+ out:
+
+    /* Flush last write and discard cache for file. */
+    discard_file_cache(io_fd, 1 /* flush */);
+
+    free(to_send);
+    free(to_fix);
+
+    DPRINTF("Save exit rc=%d\n",rc);
+
+    return !!rc;
+}
+
+
+int xc_kemari_update(int xc_handle, int io_fd, uint32_t dom,
+                     void *kemari_ring, uint32_t flags,
+                     void (*qemu_save_image)(int),
+                     void (*qemu_end_flip)(void),
+                     void (*qemu_end_save)(void),
+                     void (*qemu_image_sent)(void))
+{
+    int rc = 1, k;
+    int debug = (flags & XCFLAGS_DEBUG);
+    uint32_t i, j, index = 0;
+    unsigned int batch = 0;
+    struct kemari_ring *ring = (struct kemari_ring *)kemari_ring;
+    struct kemari_ent *buf;
+    struct iovec iov[MAX_BATCH_SIZE + 2]; /* 2 for batch and pfn_type */
+    int iovcnt = 2;
+
+#define ADD_IOV(base, len) do {                                         \
+    iov[iovcnt].iov_base = base;                                        \
+    iov[iovcnt].iov_len = len;                                          \
+    iovcnt++;                                                           \
+} while (0)
+
+
+
+    /* flip active qemu */
+    qemu_active = qemu_non_active;
+    qemu_non_active = qemu_active ? 0 : 1;
+    qemu_save_image(qemu_active);
+
+    /*
+     * main iteration starts from here
+     */
+    while (ring->cons < ring->prod) {
+
+        kemari_ring_read(ring, &buf);
+
+        for (i = buf->u.index.start, j = buf->u.index.end; i < j; i++) {
+
+            int next, offset = 0;
+
+            index = i * BITS_PER_LONG;
+
+            kemari_ring_read(ring, &buf);
+
+            while (buf->u.dirty_bitmap && offset < BITS_PER_LONG) {
+                int n;
+                next = ffs(buf->u.dirty_bitmap);
+                buf->u.dirty_bitmap >>= next;
+                offset += next;
+                n = offset + index - 1;
+                if (((n >= 0xa0 && n < 0xc0) /* VGA hole */
+                         || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT)
+                             && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ ) {
+                    if (n >= shared_info_frame && n <= shared_info_frame + 32) 
{
+                        ;
+                    } else {
+                        continue;
+                    }
+                }
+                ADD_IOV(search_guest_mem(xc_handle, dom, n), PAGE_SIZE);
+                pfn_type[batch] = n;
+                batch++;
+            }
+
+            if ((batch + BITS_PER_LONG - 1 < MAX_BATCH_SIZE) &&
+                !(ring->cons == ring->prod))
+                continue;
+
+            /* Pull in the dirty bits from qemu-dm too */
+            qemu_end_flip();
+            for ( k = 0; k < BITMAP_SIZE / BITS_PER_LONG; k++) {
+                if (qemu_bitmaps[qemu_non_active][k] != 0) {
+                    unsigned int bmp = qemu_bitmaps[qemu_non_active][k];
+
+                    index = k * BITS_PER_LONG;
+                    while (bmp && offset < BITS_PER_LONG) {
+                        int n, next, offset = 0;
+                        next = ffs(bmp);
+                        bmp >>= next;
+                        offset += next;
+                        n = offset + index - 1;
+
+                        ADD_IOV(search_guest_mem(xc_handle, dom, n), 
PAGE_SIZE);
+                        pfn_type[batch] = n;
+                        batch++;
+                    }
+                    qemu_bitmaps[qemu_non_active][k] = 0;
+                }
+                if (batch >= MAX_BATCH_SIZE) {
+                    ERROR("Sorry, reached MAX_BATCH_SIZE.  "
+                        "We will fix this lator.");
+                    goto out;
+                }
+            }
+
+            PPRINTF("batch %d\n", batch);
+
+            /* send pages */
+            iov[0].iov_base = &batch;
+            iov[0].iov_len = sizeof(batch);
+
+            iov[1].iov_base = pfn_type;
+            iov[1].iov_len = sizeof(pfn_type[0]) * batch;
+
+            for (k = 0; k < iovcnt / IOV_MAX + 1; k++) {
+                int count = (iovcnt<IOV_MAX*(k+1))?(iovcnt-IOV_MAX*k):IOV_MAX;
+                if (writev_exact(io_fd, &iov[IOV_MAX * k], count)) {
+                    ERROR("Error when writing pages state file (2--4)"
+                          " (errno %d)", errno);
+                    goto out;
+                }
+            }
+
+            batch = 0;
+        }
+    }
+
+    if (send_ident_pt(xc_handle, io_fd, dom) < 0)
+        goto out;
+    qemu_end_save();
+    if (!debug && send_qemu_image(xc_handle, io_fd, dom) < 0)
+        goto out;
+    qemu_image_sent();
+
+    /* Zero terminate */
+    i = 0;
+    if ( write_exact(io_fd, &i, sizeof(int)) )
+    {
+        PERROR("Error when writing to state file (6')");
+        goto out;
+    }
+
+    if (send_hvm_context(xc_handle, io_fd, ring, dom) < 0)
+        goto out;
+
+    if (!debug)
+    {
+        int rcv_status;
+        if ( read_exact(io_fd, &rcv_status, sizeof(int))) {
+            ERROR("Error when reading receiver status");
+            goto out;
+        }
+    }
+
+    rc = 0;
+out:
+
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 19201eebab16 tools/xcutils/xc_kemari_save.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xcutils/xc_kemari_save.c    Wed Mar 04 17:04:25 2009 +0900
@@ -0,0 +1,518 @@
+/*
+ * xc_kemari_save.c
+ *
+ * Save the state of a running Linux session.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file "COPYING" in the main directory of
+ * this archive for more details.
+ *
+ * This source code is based on xc_save.c.
+ * Copied qemu_destroy_buffer and init_qemu_maps from xc_save.c.
+ *
+ * Copyright (C) 2005 by Christian Limpach
+ *
+ */
+
+
+#include <err.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+
+#include <xs.h>
+#include <xenctrl.h>
+#include <xenguest.h>
+#include <xc_private.h>
+#include <xen/kemari.h>
+
+static volatile sig_atomic_t run = 1;
+static int xc_handle, xce_handle, io_fd;
+static struct kemari_ring *ring = NULL;
+static uint32_t kemari_ring_size = 0;
+static pid_t qemu_pid;
+static int is_finalized = 0;
+static int domid;
+
+/* For HVM guests, there are two sources of dirty pages: the Xen shadow
+ * log-dirty bitmap, which we get with a hypercall, and qemu's version.
+ * The protocol for getting page-dirtying data from qemu uses a
+ * double-buffered shared memory interface directly between xc_save and
+ * qemu-dm.
+ *
+ * xc_save calculates the size of the bitmaps and notifies qemu-dm
+ * through the store that it wants to share the bitmaps.  qemu-dm then
+ * starts filling in the 'active' buffer.
+ *
+ * To change the buffers over, xc_save writes the other buffer number to
+ * the store and waits for qemu to acknowledge that it is now writing to
+ * the new active buffer.  xc_save can then process and clear the old
+ * active buffer. */
+
+static char *qemu_active_path;
+static char *qemu_next_active_path;
+static int qemu_shmid = -1;
+static struct xs_handle *xs;
+
+
+/* atexit() hook: mark the qemu-dm shared-memory segment for destruction.
+ * The id is cleared first so a second invocation is a harmless no-op. */
+static void qemu_destroy_buffer(void)
+{
+    int shmid = qemu_shmid;
+
+    qemu_shmid = -1;
+    if (shmid != -1)
+        shmctl(shmid, IPC_RMID, NULL);
+}
+
+/* Hand-shake flags shared with qemu-dm, living past the two dirty
+ * bitmaps in the shm segment (see init_qemu_maps):
+ *   [0] = next active bitmap buffer we want qemu to switch to,
+ *   [1] = set by qemu once it has flipped buffers (qemu_end_flip),
+ *   [2] = set by qemu once its device state is saved (qemu_end_save). */
+static char *kemari_qemu_info = NULL;
+/* Ask qemu-dm (via SIGUSR1) to save its device state and switch to
+ * dirty-bitmap buffer 'next_active'.  The flip-acknowledge flag is
+ * cleared before signalling so qemu_end_flip() waits for a fresh ack. */
+static void qemu_save_image(int next_active)
+{
+    kemari_qemu_info[0] = next_active;
+    kemari_qemu_info[1] = 0;
+    xen_wmb();  /* publish the request before waking qemu */
+    kill(qemu_pid, SIGUSR1);
+}
+
+/* Busy-wait (with read barriers) until qemu-dm acknowledges the
+ * bitmap-buffer flip requested by qemu_save_image(). */
+static void qemu_end_flip(void)
+{
+    while (kemari_qemu_info[1] == 0)
+        xen_rmb();
+}
+
+/* Busy-wait (with read barriers) until qemu-dm reports that its
+ * device-state save has completed. */
+static void qemu_end_save(void)
+{
+    while (kemari_qemu_info[2] == 0)
+        xen_rmb();
+}
+
+/* Called once the QEMU image has been transmitted: clear the
+ * save-complete flag so the next qemu_end_save() round starts clean. */
+static void qemu_image_sent(void)
+{
+    /* after QEMU image sent */
+    kemari_qemu_info[2] = 0;
+    xen_wmb();  /* make the reset visible to qemu before we continue */
+}
+
+/* Create the double-buffered dirty-bitmap shared-memory segment and
+ * advertise it to qemu-dm through xenstore.
+ *
+ * Segment layout: two bitmaps of 'bitmap_size' bytes each, then one
+ * extra page whose first bytes hold the kemari_qemu_info hand-shake
+ * flags.  The SysV shm key is written (as 16 hex digits) under
+ * /local/domain/<domid>/logdirty/key so qemu-dm can attach.
+ * Returns the mapped segment; any failure exits via errx(). */
+static void *init_qemu_maps(int domid, unsigned int bitmap_size)
+{
+    key_t key;
+    char key_ascii[17] = {0,};
+    void *seg;
+    char *path, *p;
+
+    /* Make a shared-memory segment */
+    do {
+        key = rand(); /* No security, just a sequence of numbers */
+        qemu_shmid = shmget(key, 2 * bitmap_size + PAGE_SIZE,
+                       IPC_CREAT|IPC_EXCL|S_IRUSR|S_IWUSR);
+        if (qemu_shmid == -1 && errno != EEXIST)
+            errx(1, "can't get shmem to talk to qemu-dm");
+    } while (qemu_shmid == -1);
+
+    /* Remember to tidy up after ourselves */
+    atexit(qemu_destroy_buffer);
+
+    /* Map it into our address space */
+    seg = shmat(qemu_shmid, NULL, 0);
+    if (seg == (void *) -1)
+        errx(1, "can't map shmem to talk to qemu-dm");
+    memset(seg, 0, 2 * bitmap_size + PAGE_SIZE);
+
+    /* Write the size of it into the first 32 bits */
+    *(uint32_t *)seg = bitmap_size;
+
+    /* Tell qemu about it */
+    if ((xs = xs_daemon_open()) == NULL)
+        errx(1, "Couldn't contact xenstore");
+    /* Build "/local/domain/0/device-model/<domid>/logdirty/" with room
+     * for the longest leaf name ("next-active"); 10 extra bytes cover
+     * the decimal domid. */
+    if (!(path = strdup("/local/domain/0/device-model/")))
+        errx(1, "can't get domain path in store");
+    if (!(path = realloc(path, strlen(path)
+                         + 10
+                         + strlen("/logdirty/next-active") + 1)))
+        errx(1, "no memory for constructing xenstore path");
+    snprintf(path + strlen(path), 11, "%i", domid);
+    strcat(path, "/logdirty/");
+    p = path + strlen(path);  /* leaf names are appended at 'p' below */
+
+    strcpy(p, "key");
+    snprintf(key_ascii, 17, "%16.16llx", (unsigned long long) key);
+    if (!xs_write(xs, XBT_NULL, path, key_ascii, 16))
+        errx(1, "can't write key (%s) to store path (%s)\n", key_ascii, path);
+
+    /* Watch for qemu's indication of the active buffer, and request it
+     * to start writing to buffer 0 */
+    strcpy(p, "active");
+    if (!xs_watch(xs, path, "qemu-active-buffer"))
+        errx(1, "can't set watch in store (%s)\n", path);
+    if (!(qemu_active_path = strdup(path)))
+        errx(1, "no memory for copying xenstore path");
+
+    strcpy(p, "next-active");
+    if (!(qemu_next_active_path = strdup(path)))
+        errx(1, "no memory for copying xenstore path");
+
+    /* Hand-shake flags live in the spare page after the two bitmaps. */
+    kemari_qemu_info = seg + 2 * bitmap_size;
+    xen_wmb();
+    qemu_save_image(0);
+
+    free(path);
+    return seg;
+}
+
+/* Termination-signal handler: ask the main event loop to stop. */
+static void close_handler(int sig_type)
+{
+    (void)sig_type;  /* same action for every registered signal */
+    run = 0;
+}
+
+/* Service one firing of the kemari event channel: read the pending
+ * port, push the guest-state delta to the receiver through
+ * xc_kemari_update(), then unmask the channel.
+ * Returns 0 on success, 1 on failure.  On update failure the domain
+ * is paused and qemu-dm is SIGSTOPped before returning — NOTE(review):
+ * presumably to freeze the primary so the receiver side can take over
+ * consistently; confirm against the kemari receiver. */
+static int handle_event(int domid, unsigned int flags)
+{
+    int ret = 1, rcv_port;
+
+    if ((rcv_port = xc_evtchn_pending(xce_handle)) < 0) {
+        ERROR("Failed to read from event fd");
+        goto out;
+    }
+
+    if (xc_kemari_update(xc_handle, io_fd, domid, ring, flags,
+       qemu_save_image, qemu_end_flip, qemu_end_save, qemu_image_sent) != 0) {
+        /* Freeze both the guest and its device model before reporting. */
+        xc_domain_pause(xc_handle, domid);
+        kill(qemu_pid, SIGSTOP);
+        ERROR("xc_kemari_update failed");
+        goto out;
+    }
+
+    if (xc_evtchn_unmask(xce_handle, rcv_port) < 0) {
+        ERROR("Failed to write to event fd");
+        goto out;
+    }
+
+    ret = 0;
+out:
+    return ret;
+}
+
+/* Install 'handler' for the usual termination signals
+ * (SIGQUIT, SIGINT, SIGHUP, SIGTERM).
+ * The sigaction struct is zeroed first: the original left fields other
+ * than sa_handler/sa_mask/sa_flags uninitialized (stack garbage). */
+static void set_signal_handler(void (*handler)(int))
+{
+    static const int sigs[] = { SIGQUIT, SIGINT, SIGHUP, SIGTERM };
+    struct sigaction act;
+    size_t i;
+
+    memset(&act, 0, sizeof(act));
+    act.sa_handler = handler;
+    sigemptyset(&act.sa_mask);
+    act.sa_flags = 0;
+    for (i = 0; i < sizeof(sigs) / sizeof(sigs[0]); i++)
+        sigaction(sigs[i], &act, 0);
+}
+
+/* Attach all event channels of one device class ("vbd" or "vif") of
+ * 'domid' to kemari as outgoing taps.  'label' is only used in log
+ * messages ("blk"/"net").  Returns 0 on success, non-zero on failure.
+ * Fixes vs. the original: 'data' and 'list' are no longer leaked on
+ * the error path, and the index matches xs_directory's unsigned size. */
+static int attach_class_ports(struct xs_handle *xs_handle, int domid,
+                              const char *devtype, const char *label)
+{
+    char **list, *data;
+    unsigned int list_size, data_size, i;
+    char path[128];
+    uint32_t port;
+    int ret = 1;
+
+    snprintf(path, sizeof(path), "/local/domain/%d/device/%s",
+             domid, devtype);
+    list = xs_directory(xs_handle, XBT_NULL, path, &list_size);
+    if (list == NULL)
+        errx(1, "xs_directory (%s) failed", path);
+
+    for (i = 0; i < list_size; i++) {
+        snprintf(path, sizeof(path),
+            "/local/domain/%d/device/%s/%s/event-channel",
+            domid, devtype, list[i]);
+        data = xs_read(xs_handle, XBT_NULL, path, &data_size);
+        if (data == NULL)  /* device without an event channel: skip */
+            continue;
+        port = strtoul(data, NULL, 10);
+        free(data);
+        if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_attach,
+                                 &port, NULL,
+                                 NULL, KEMARI_TAP_OUT)) != 0) {
+            ERROR("Error when attaching %s_port (%d) on kemari", label, port);
+            goto out;
+        }
+        DPRINTF("%s_port %d attached\n", label, port);
+    }
+
+    ret = 0;
+out:
+    free(list);
+    return ret;
+}
+
+/* Attach every block (vbd) and network (vif) event channel of 'domid'
+ * to kemari so their traffic triggers synchronization.
+ * Returns 0 on success, non-zero on failure. */
+static int attach_ports(int domid)
+{
+    struct xs_handle *xs_handle;
+    int ret = 1;
+
+    if ((xs_handle = xs_daemon_open()) == NULL)
+        errx(1, "Couldn't contact xenstore");
+
+    if (attach_class_ports(xs_handle, domid, "vbd", "blk") != 0)
+        goto out;
+    if (attach_class_ports(xs_handle, domid, "vif", "net") != 0)
+        goto out;
+
+    /* attach success */
+    ret = 0;
+
+out:
+    xs_daemon_close(xs_handle);
+
+    return ret;
+}
+
+/* Look up the device-model (qemu-dm) pid of 'domid' in xenstore.
+ * Returns the pid, or 0 if it could not be found. */
+static pid_t get_qemu_pid(int domid)
+{
+    struct xs_handle *xsh;
+    char node[128];
+    char *value;
+    unsigned int value_len;
+    pid_t pid = 0;
+
+    xsh = xs_daemon_open();
+    if (xsh == NULL)
+        errx(1, "Couldn't contact xenstore");
+
+    snprintf(node, sizeof(node),
+        "/local/domain/%d/image/device-model-pid", domid);
+    value = xs_read(xsh, XBT_NULL, node, &value_len);
+    if (value != NULL) {
+        pid = strtoul(value, NULL, 10);
+        free(value);
+    } else {
+        ERROR("Could not find QEMU pid for domid %d", domid);
+    }
+
+    xs_daemon_close(xsh);
+
+    return pid;
+}
+
+/* Tear down kemari state: unmap the ring, turn kemari off, disable
+ * shadow log-dirty mode, and close the control interface.  Registered
+ * with atexit(); the is_finalized guard makes repeated calls no-ops.
+ * If 'run' was cleared by a termination signal the local domain is
+ * destroyed as well — NOTE(review): presumably because the receiver
+ * side takes over; confirm against the kemari receiver.
+ * Fixes vs. the original: 'ring' is cleared after munmap, the unused
+ * local 'ret' is gone, and a typo in the log message is corrected. */
+static void finalize(void)
+{
+    if (is_finalized)
+        return;
+
+    /* Ignore further termination signals while tearing down. */
+    set_signal_handler(SIG_IGN);
+    if (ring != NULL) {
+        munmap(ring, kemari_ring_size * PAGE_SIZE);
+        ring = NULL;  /* guard against accidental reuse / double unmap */
+    }
+
+    if (xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_off,
+                            NULL, NULL, NULL, 0) != 0) {
+        ERROR("Error when turning off kemari");
+    } else {
+        DPRINTF("successfully execute KEMARI_OP_off\n");
+    }
+
+    if ( xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_OFF,
+                           NULL, 0, NULL, 0, NULL) < 0 ) {
+        ERROR("Warning - couldn't disable shadow mode");
+    }
+
+    if (!run)
+        xc_domain_destroy(xc_handle, domid);
+
+    xc_interface_close(xc_handle);
+
+    is_finalized = 1;
+}
+
+/*
+ * usage: xc_kemari_save iofd domid maxit maxf flags
+ *
+ * Enables kemari for 'domid', transfers the initial guest image over
+ * 'iofd', then loops forwarding state deltas each time the kemari
+ * event channel fires, until a termination signal clears 'run'.
+ *
+ * Fixes vs. the original:
+ *  - the select() fd set is rebuilt every iteration (select modifies
+ *    it, and leaves it undefined on error);
+ *  - 'port' is an int, so the < 0 error check on
+ *    xc_evtchn_bind_interdomain actually fires (it could never be
+ *    true on the original uint32_t);
+ *  - kemari_mfn is cast to match the %llu format specifier;
+ *  - the xc_kemari_control assignment is parenthesized so 'ret'
+ *    receives the return code, not the comparison result.
+ */
+int
+main(int argc, char **argv)
+{
+    unsigned int maxit, max_f, flags;
+    int ret = 1;
+    int evtchn_fd;
+    int port;                 /* int: bind returns a negative error code */
+    uint32_t kemari_port;
+    uint64_t kemari_mfn;
+    fd_set inset;
+
+    if (argc != 6)
+        errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]);
+
+    xc_handle = xc_interface_open();
+    if (xc_handle < 0)
+        errx(1, "failed to open control interface");
+
+    io_fd = atoi(argv[1]);
+    domid = atoi(argv[2]);
+    maxit = atoi(argv[3]);
+    max_f = atoi(argv[4]);
+    flags = atoi(argv[5]);
+
+    set_signal_handler(close_handler);
+    if ((qemu_pid = get_qemu_pid(domid)) == 0)
+        errx(1, "failed to get qemu pid");
+    atexit(finalize);
+
+    if (io_fd == -1) /* means test mode */
+    {
+        io_fd = open("/dev/null", O_RDWR);
+        flags |= XCFLAGS_DEBUG;
+    }
+    else
+    {
+        /* State deltas are latency-sensitive: disable Nagle. */
+        int one = 1;
+        if (setsockopt(io_fd, IPPROTO_TCP, TCP_NODELAY,
+                       &one, sizeof(one)) < 0) {
+            ERROR("failed to set TCP_NODELAY");
+        }
+    }
+
+    if ((xce_handle = xc_evtchn_open()) < 0) {
+        errx(1, "failed to open control interface");
+    }
+
+    evtchn_fd = xc_evtchn_fd(xce_handle);
+
+    if ( xc_shadow_control(xc_handle, domid,
+                           XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                           NULL, 0, NULL, 0, NULL) < 0 )
+    {
+        int frc;
+        /* log-dirty already enabled? There's no test op,
+           so attempt to disable then reenable it */
+        frc = xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_OFF,
+                                NULL, 0, NULL, 0, NULL);
+        if ( frc >= 0 )
+        {
+            frc = xc_shadow_control(xc_handle, domid,
+                                    XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                                    NULL, 0, NULL, 0, NULL);
+        }
+
+        if ( frc < 0 )
+        {
+            err(errno, "Couldn't enable shadow mode (rc %d)", frc);
+        }
+    }
+
+    if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_enable,
+                                 &kemari_port, &kemari_ring_size,
+                                 &kemari_mfn, 0)) != 0) {
+        errx(1, "Error when turning on kemari");
+    }
+
+    DPRINTF("kemari_port=%u, kemari_mfn=%llu, kemari_ring_size=%u\n",
+           kemari_port, (unsigned long long)kemari_mfn, kemari_ring_size);
+
+    if (attach_ports(domid) != 0) {
+        ERROR("attaching port failed ");
+        goto out;
+    }
+
+    if ((port = xc_evtchn_bind_interdomain(xce_handle, domid,
+                                           kemari_port)) < 0) {
+        ERROR("xc_evtchn_bind_interdomain failed ");
+        goto out;
+    }
+
+    if ((ring = xc_map_foreign_range(xc_handle, DOMID_XEN,
+                                     kemari_ring_size * PAGE_SIZE,
+                                     PROT_READ | PROT_WRITE,
+                                     kemari_mfn)) == 0) {
+        ERROR("xc_map_foreign_range failed");
+        goto out;
+    }
+
+    /* Pause the guest while the initial full image is transferred. */
+    if (xc_domain_pause(xc_handle, domid) < 0) {
+        ERROR("Domain appears not to have paused");
+        goto out;
+    }
+
+    ret = xc_kemari_save(xc_handle, io_fd, domid, ring, flags,
+                         !!(flags & XCFLAGS_HVM),
+                         &init_qemu_maps);
+    if (ret != 0) {
+        ERROR("xc_kemari_save failed");
+        goto out;
+    }
+
+    if (xc_domain_unpause(xc_handle, domid) < 0) {
+        ERROR("Domain appears not to have unpaused");
+        goto out;
+    }
+
+    DPRINTF("running start");
+
+    while (run) {
+
+        /* select() modifies the set (and leaves it undefined on error),
+         * so rebuild it on every iteration. */
+        FD_ZERO(&inset);
+        FD_SET(evtchn_fd, &inset);
+
+        if (select(evtchn_fd + 1, &inset, NULL, NULL, NULL) < 0) {
+            if (errno == EINTR)
+                continue;
+            ERROR("Error when waiting events by select()");
+            break;
+        }
+
+        if (evtchn_fd != -1 && FD_ISSET(evtchn_fd, &inset)) {
+
+            if ((ret = handle_event(domid, flags)) != 0) {
+                ERROR("Error when handling events");
+                break;
+            }
+
+            if (xc_evtchn_notify(xce_handle, port) < 0) {
+                ERROR("xc_evtchn_notify failed");
+                break;
+            }
+
+            if(xc_domain_unpause(xc_handle, domid) < 0) {
+                ERROR("xc_domain_unpause");
+                break;
+            }
+
+        }
+    }
+
+ out:
+    close(io_fd);
+    finalize();
+
+    return ret;
+}
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+




_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.