[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [RFC Patch v4 16/18] setup and control colo-agent for primary vm



This patch adds the machinery required for protecting a primary vm's
network device state. This patch consists of three parts:

1. Hotplug scripts: The colo-agent-setup script is responsible for
   setting up and tearing down the necessary infrastructure required
   for COLO agent.  This script should be invoked by libxl for each
   of the guest's network interfaces, when starting or stopping COLO.

   The COLO agent compares the packets sent by the primary vm and the
   secondary vm, and triggers a new checkpoint if they differ.

   Apart from returning success/failure indication via the usual hotplug
   entries in xenstore, this script also writes to xenstore, the name of
   the ifb device to be used to control COLO agent.

   The script relies on libnl3 command line utilities to perform various
   setup/teardown functions. The script is confined to Linux platforms only
   since NetBSD does not seem to have libnl3.

2. Checkpoint network device: Implements the interfaces required by the
   checkpoint abstract device layer. A note about the implementation:
   a) setup() and teardown() are called for each vif attached to the
      primary vm.
      During setup(), the hotplug script is called to set up the COLO agent
      for the given vif. The script does the following:
      i)   choose two available IFB devices from system(called IFB_PRIMARY,
           and IFB_SECONDARY), and set up the colo qdisc on these two ifb
           devices.
      ii)  copy and forward the vif egress traffic to the FORWARD device
      iii) redirect vif egress traffic to the IFB_PRIMARY device
      iv)  redirect FORWARD device egress traffic to the IFB_SECONDARY device

      During teardown(), the hotplug script is called again for each
      vif. The script does the following:
      i)   remove the vif->IFB_PRIMARY traffic redirection
      ii)  remove the FORWARD->IFB_SECONDARY traffic redirection
      iii) remove the vif->FORWARD traffic forwarding.
      iv)  release the two ifb devices and the colo qdisc associated with
           them

   b) The checkpoint callbacks are not per-vif, so they are implemented in
      libxl_colo_save.c

3. colo-tc: a simple tc-like command that supports only the colo qdisc.

Signed-off-by: Wen Congyang <wency@xxxxxxxxxxxxxx>
---
 .gitignore                             |   1 +
 tools/hotplug/Linux/Makefile           |   2 +
 tools/hotplug/Linux/colo-agent-setup   | 210 ++++++++++++
 tools/hotplug/Linux/remus-netbuf-setup |  45 +--
 tools/hotplug/Linux/xen-network-ft.sh  | 102 ++++++
 tools/libxl/Makefile                   |   8 +-
 tools/libxl/colo-tc.c                  | 589 +++++++++++++++++++++++++++++++++
 tools/libxl/libxl.c                    |   6 +
 tools/libxl/libxl_colo_nic.c           | 289 ++++++++++++++++
 tools/libxl/libxl_colo_save.c          | 177 +++++++++-
 tools/libxl/libxl_internal.h           |   8 +
 tools/libxl/libxl_types.idl            |   1 +
 tools/libxl/xl_cmdimpl.c               |   3 +
 13 files changed, 1390 insertions(+), 51 deletions(-)
 create mode 100755 tools/hotplug/Linux/colo-agent-setup
 create mode 100644 tools/hotplug/Linux/xen-network-ft.sh
 create mode 100644 tools/libxl/colo-tc.c
 create mode 100644 tools/libxl/libxl_colo_nic.c

diff --git a/.gitignore b/.gitignore
index b24e905..137de89 100644
--- a/.gitignore
+++ b/.gitignore
@@ -286,6 +286,7 @@ tools/libxl/*.pyc
 tools/libxl/libxl-save-helper
 tools/libxl/test_timedereg
 tools/libxl/xen-init-dom0
+tools/libxl/colo-tc
 tools/blktap2/control/tap-ctl
 tools/firmware/etherboot/eb-roms.h
 tools/firmware/etherboot/gpxe-git-snapshot.tar.gz
diff --git a/tools/hotplug/Linux/Makefile b/tools/hotplug/Linux/Makefile
index 1706c05..d8bd6d4 100644
--- a/tools/hotplug/Linux/Makefile
+++ b/tools/hotplug/Linux/Makefile
@@ -26,12 +26,14 @@ XEN_SCRIPTS += vscsi
 XEN_SCRIPTS += block-iscsi
 XEN_SCRIPTS += block-drbd-probe
 XEN_SCRIPTS += $(XEN_SCRIPTS-y)
+XEN_SCRIPTS += colo-agent-setup
 
 SUBDIRS-$(CONFIG_SYSTEMD) += systemd
 
 XEN_SCRIPT_DATA = xen-script-common.sh locking.sh logging.sh
 XEN_SCRIPT_DATA += xen-hotplug-common.sh xen-network-common.sh vif-common.sh
 XEN_SCRIPT_DATA += block-common.sh
+XEN_SCRIPT_DATA += xen-network-ft.sh
 
 UDEV_RULES_DIR = $(CONFIG_DIR)/udev
 UDEV_RULES = xen-backend.rules $(UDEV_RULES-y)
diff --git a/tools/hotplug/Linux/colo-agent-setup 
b/tools/hotplug/Linux/colo-agent-setup
new file mode 100755
index 0000000..8cdc41a
--- /dev/null
+++ b/tools/hotplug/Linux/colo-agent-setup
@@ -0,0 +1,210 @@
+#! /bin/bash
+
+dir=$(dirname "$0")
+. "$dir/xen-hotplug-common.sh"
+. "$dir/hotplugpath.sh"
+. "$dir/xen-network-ft.sh"
+
+findCommand "$@"
+
+if [ "$command" != "setup" -a  "$command" != "teardown" ]
+then
+    echo "Invalid command: $command"
+    log err "Invalid command: $command"
+    exit 1
+fi
+
+evalVariables "$@"
+
+: ${vifname:?}
+: ${XENBUS_PATH:?}
+: ${forwarddev:?}
+: ${mode:?}
+: ${vmid:?}
+
+if [ "$mode" != "primary" -a "$mode" != "secondary" ]
+then
+    echo "Invalid mode: $mode"
+    log err "Invalid mode: $mode"
+    exit 1
+fi
+
+# redirect input packets from src_nic to dst_nic
+function redirect_nic_traffic()
+{
+    local src_nic=$1
+    local dst_nic=$2
+
+    if ! tc qdisc add dev $src_nic ingress > /dev/null 2>&1
+    then
+        fatal "Unable to add ingress qdisc to nic $src_nic"
+    fi
+
+    if ! tc filter add dev $src_nic parent ffff: protocol ip prio 10 \
+        u32 match u32 0 0 flowid 1:2 action mirred egress redirect dev \
+        $dst_nic > /dev/null 2>&1
+    then
+        fatal "Unable to redirect ip packets from $src_nic to $dst_nic"
+    fi
+
+    if ! tc filter add dev $src_nic parent ffff: protocol arp prio 11 \
+        u32 match u32 0 0 flowid 1:2 action mirred egress redirect dev \
+        $dst_nic > /dev/null 2>&1
+    then
+        fatal "Unable to redirect arp packets from $src_nic to $dst_nic"
+    fi
+}
+
+function stop_redirect_nic_traffic()
+{
+    local src_nic=$1
+    # undo redirect_nic_traffic: drop the ip/arp filters, then the ingress qdisc
+    do_without_error tc filter del dev $src_nic parent ffff: protocol ip prio 10 u32
+    do_without_error tc filter del dev $src_nic parent ffff: protocol arp prio 11 u32
+    do_without_error tc qdisc del dev $src_nic ingress
+}
+
+# copy_and_forward_nic_traffic $src_nic $dst_nic
+# mirror the ip and arp packets of src_nic to dst_nic; fatal on any tc failure
+function copy_and_forward_nic_traffic()
+{
+    local src_nic=$1
+    local dst_nic=$2
+    if ! tc qdisc add dev $src_nic root handle 1: prio > /dev/null 2>&1
+    then
+        fatal "Unable to add qdisc prio to nic $src_nic"
+    fi
+
+    if ! tc filter add dev $src_nic parent 1: protocol ip prio 10 \
+        u32 match u32 0 0 flowid 1:2 action mirred egress mirror dev \
+        $dst_nic > /dev/null 2>&1
+    then
+        fatal "Unable to copy and forward ip packets from $src_nic to $dst_nic"
+    fi
+
+    if ! tc filter add dev $src_nic parent 1: protocol arp prio 11 \
+        u32 match u32 0 0 flowid 1:2 action mirred egress mirror dev \
+        $dst_nic > /dev/null 2>&1
+    then
+        fatal "Unable to copy and forward arp packets from $src_nic to $dst_nic"
+    fi
+}
+
+function stop_copy_and_forward_nic_traffic()
+{
+    local src_nic=$1
+    # undo copy_and_forward_nic_traffic: drop the filters, then the prio qdisc
+    do_without_error tc filter del dev $src_nic parent 1: protocol ip prio 10 u32
+    do_without_error tc filter del dev $src_nic parent 1: protocol arp prio 11 u32
+    do_without_error tc qdisc del dev $src_nic root handle 1: prio
+}
+
+function teardown_ifb()
+{
+    local ifb=$1
+
+    if [ -z "$ifb" ]
+    then
+        return
+    fi
+
+    do_without_error ip link set dev "$ifb" down
+    do_without_error tc qdisc del dev "$ifb" root handle 1: colo
+}
+
+function setup_primary()
+{
+    $LIBEXEC_BIN/colo-tc qdisc add dev $IFB_PRIMARY root handle 1: colo \
+        dev $IFB_SECONDARY primary vmid $vmid
+    $LIBEXEC_BIN/colo-tc qdisc add dev $IFB_SECONDARY root handle 1: colo \
+        dev $IFB_PRIMARY secondary vmid $vmid
+
+    redirect_nic_traffic $forwarddev $IFB_SECONDARY
+    copy_and_forward_nic_traffic $vifname $forwarddev
+    redirect_nic_traffic $vifname $IFB_PRIMARY
+    if ! ifconfig $forwarddev promisc
+    then
+        fatal "device $forwarddev cannot enter promiscuous mode"
+    fi
+}
+
+function teardown_primary()
+{
+    local ifb=
+    # Release the IFB_PRIMARY device that setup_ifb recorded in xenstore.
+    if xenstore-exists "$XENBUS_PATH/ifb_primary"
+    then
+        ifb=`xenstore-read "$XENBUS_PATH/ifb_primary" 2>/dev/null || true`
+        IFB_PRIMARY=$ifb
+        teardown_ifb "$ifb"
+    fi
+    # Likewise for IFB_SECONDARY; both names are reused by the final log line.
+    if xenstore-exists "$XENBUS_PATH/ifb_secondary"
+    then
+        ifb=`xenstore-read "$XENBUS_PATH/ifb_secondary" 2>/dev/null || true`
+        IFB_SECONDARY=$ifb
+        teardown_ifb "$ifb"
+    fi
+    # Remove the tc redirection and mirroring installed by setup_primary.
+    stop_redirect_nic_traffic $forwarddev
+    stop_redirect_nic_traffic $vifname
+    stop_copy_and_forward_nic_traffic $vifname
+
+    do_without_error ifconfig $forwarddev -promisc
+}
+
+function setup_secondary()
+{
+    redirect_nic_traffic $forwarddev $vifname
+    redirect_nic_traffic $vifname $forwarddev
+
+    if ! ifconfig $forwarddev promisc
+    then
+        fatal "device $forwarddev cannot enter promiscuous mode"
+    fi
+}
+
+function teardown_secondary()
+{
+    stop_redirect_nic_traffic $vifname
+    stop_redirect_nic_traffic $forwarddev
+
+    do_without_error ifconfig $forwarddev -promisc
+}
+
+case "$command" in
+    setup)
+        if [ "$mode" = "primary" ]
+        then
+            claim_lock "pickifb"
+            setup_ifb $vifname ifb_primary
+            IFB_PRIMARY=$ifb
+            setup_ifb $vifname ifb_secondary
+            IFB_SECONDARY=$ifb
+            setup_primary
+            release_lock "pickifb"
+        else
+            setup_secondary
+        fi
+
+        success
+        ;;
+    teardown)
+        if [ "$mode" = "primary" ]
+        then
+            teardown_primary
+        else
+            teardown_secondary
+        fi
+        ;;
+esac
+
+if [ "$mode" = "primary" ]
+then
+    log debug "Successful colo-agent-setup $command for $vifname." \
+              " ifb_primary: $IFB_PRIMARY, ifb_secondary: $IFB_SECONDARY, " \
+              "forwarddev: $forwarddev."
+else
+    log debug "Successful colo-agent-setup $command for $vifname."\
+              " forwarddev $forwarddev."
+fi
diff --git a/tools/hotplug/Linux/remus-netbuf-setup 
b/tools/hotplug/Linux/remus-netbuf-setup
index 87dfa69..9391d1e 100644
--- a/tools/hotplug/Linux/remus-netbuf-setup
+++ b/tools/hotplug/Linux/remus-netbuf-setup
@@ -76,6 +76,7 @@
 #specific setup code such as renaming.
 dir=$(dirname "$0")
 . "$dir/xen-hotplug-common.sh"
+. "$dir/xen-network-ft.sh"
 
 findCommand "$@"
 
@@ -116,47 +117,6 @@ check_modules() {
     done
 }
 
-#return 0 if the ifb is free
-check_ifb() {
-    local installed=`nl-qdisc-list -d $1`
-    [ -n "$installed" ] && return 1
-
-    for domid in `xenstore-list "/local/domain" 2>/dev/null || true`
-    do
-        [ $domid -eq 0 ] && continue
-        xenstore-exists "/libxl/$domid/remus/netbuf" || continue
-        for devid in `xenstore-list "/libxl/$domid/remus/netbuf" 2>/dev/null 
|| true`
-        do
-            local path="/libxl/$domid/remus/netbuf/$devid/ifb"
-            xenstore-exists $path || continue
-            local ifb=`xenstore-read "$path" 2>/dev/null || true`
-            [ "$ifb" = "$1" ] && return 1
-        done
-    done
-
-    return 0
-}
-
-setup_ifb() {
-
-    for ifb in `ifconfig -a -s|egrep ^ifb|cut -d ' ' -f1`
-    do
-        check_ifb "$ifb" || continue
-        REMUS_IFB="$ifb"
-        break
-    done
-
-    if [ -z "$REMUS_IFB" ]
-    then
-        fatal "Unable to find a free ifb device for $vifname"
-    fi
-
-    #not using xenstore_write that automatically exits on error
-    #because we need to cleanup
-    xenstore_write "$XENBUS_PATH/ifb" "$REMUS_IFB"
-    do_or_die ip link set dev "$REMUS_IFB" up
-}
-
 redirect_vif_traffic() {
     local vif=$1
     local ifb=$2
@@ -215,7 +175,8 @@ case "$command" in
         check_modules
 
         claim_lock "pickifb"
-        setup_ifb
+        setup_ifb $vifname ifb
+        REMUS_IFB=$ifb
         redirect_vif_traffic "$vifname" "$REMUS_IFB"
         add_plug_qdisc "$vifname" "$REMUS_IFB"
         release_lock "pickifb"
diff --git a/tools/hotplug/Linux/xen-network-ft.sh 
b/tools/hotplug/Linux/xen-network-ft.sh
new file mode 100644
index 0000000..9c642e4
--- /dev/null
+++ b/tools/hotplug/Linux/xen-network-ft.sh
@@ -0,0 +1,102 @@
+#
+# Copyright (C) 2014 FUJITSU LIMITED
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+# ifb is stored in /libxl/<domid>/<path>/<devid>/<file>
+PATH_LIST=(
+    remus/netbuf
+    colo_agent
+    colo_agent
+)
+
+FILE_LIST=(
+    ifb
+    ifb_primary
+    ifb_secondary
+)
+
+# check_one_ifb_file $domid $path $file $ifb_name
+# return 1 if /libxl/$domid/$path/<devid>/$file holds $ifb_name, else 0
+function check_one_ifb_file()
+{
+    local domid=$1
+    local path=$2
+    local file=$3
+    local ifb_name=$4
+    local full_path= ifb= devid=
+
+    for devid in `xenstore-list "/libxl/$domid/$path" 2>/dev/null || true`
+    do
+        full_path="/libxl/$domid/$path/$devid/$file"
+        xenstore-exists "$full_path" || continue
+        ifb=`xenstore-read "$full_path" 2>/dev/null || true`
+        [ "$ifb" = "$ifb_name" ] && return 1
+    done
+    return 0
+}
+
+# check_ifb $ifb_name: return 0 if the ifb is free, i.e. it has no qdisc
+# installed and no domain has recorded it in xenstore (PATH_LIST/FILE_LIST)
+function check_ifb()
+{
+    local ifb_name=$1
+    local installed=`nl-qdisc-list -d $ifb_name`
+    local path= file= domid=
+    local -i index=0
+
+    [ -n "$installed" ] && return 1
+
+    for domid in `xenstore-list "/local/domain" 2>/dev/null || true`
+    do
+        [ $domid -eq 0 ] && continue
+        index=0
+        for path in "${PATH_LIST[@]}"; do
+            # FILE_LIST[i] pairs with PATH_LIST[i]: read it before advancing
+            file=${FILE_LIST[index]}
+            index=$((index + 1))
+            xenstore-exists "/libxl/$domid/$path" || continue
+            check_one_ifb_file $domid $path $file "$ifb_name" || return 1
+        done
+    done
+
+    return 0
+}
+
+# setup_ifb $nic_name $file_name
+# Note:
+#   1. The caller should acquire the lock pickifb
+#   2. ifb name will be stored in $XENBUS_PATH/$file_name
+function setup_ifb()
+{
+    local nic_name=$1
+    local file_name=$2
+    local found=0
+
+    for ifb in `ifconfig -a -s|egrep ^ifb|cut -d ' ' -f1`
+    do
+        check_ifb "$ifb" || continue
+        found=1
+        break
+    done
+
+    if [ $found -eq 0 ]
+    then
+        fatal "Unable to find a free ifb device for $nic_name"
+    fi
+
+    xenstore_write "$XENBUS_PATH/$file_name" "$ifb"
+    do_or_die ip link set dev "$ifb" up
+}
diff --git a/tools/libxl/Makefile b/tools/libxl/Makefile
index ed46af3..1d2d8d9 100644
--- a/tools/libxl/Makefile
+++ b/tools/libxl/Makefile
@@ -57,7 +57,7 @@ LIBXL_OBJS-y += libxl_nonetbuffer.o
 endif
 
 LIBXL_OBJS-y += libxl_remus.o libxl_checkpoint_device.o libxl_remus_disk_drbd.o
-LIBXL_OBJS-y += libxl_colo_restore.o libxl_colo_save.o
+LIBXL_OBJS-y += libxl_colo_restore.o libxl_colo_save.o libxl_colo_nic.o
 
 LIBXL_OBJS-$(CONFIG_X86) += libxl_cpuid.o libxl_x86.o libxl_psr.o
 LIBXL_OBJS-$(CONFIG_ARM) += libxl_nocpuid.o libxl_arm.o
@@ -126,7 +126,7 @@ LIBXLU_OBJS = libxlu_cfg_y.o libxlu_cfg_l.o libxlu_cfg.o \
        libxlu_disk_l.o libxlu_disk.o libxlu_vif.o libxlu_pci.o
 $(LIBXLU_OBJS): CFLAGS += $(CFLAGS_libxenctrl) # For xentoollog.h
 
-CLIENTS = xl testidl libxl-save-helper xen-init-dom0
+CLIENTS = xl testidl libxl-save-helper xen-init-dom0 colo-tc
 
 CFLAGS_XL += $(CFLAGS_libxenlight)
 CFLAGS_XL += -Wshadow
@@ -249,6 +249,9 @@ libxl-save-helper: $(SAVE_HELPER_OBJS) libxenlight.so
 testidl: testidl.o libxlutil.so libxenlight.so
        $(CC) $(LDFLAGS) -o $@ testidl.o libxlutil.so $(LDLIBS_libxenlight) 
$(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
 
+colo-tc: colo-tc.o
+       $(CC) $(LDFLAGS) -o $@ colo-tc.o
+
 .PHONY: install
 install: all
        $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
@@ -259,6 +262,7 @@ install: all
        $(INSTALL_PROG) xl $(DESTDIR)$(SBINDIR)
        $(INSTALL_PROG) xen-init-dom0 $(DESTDIR)$(LIBEXEC_BIN)
        $(INSTALL_PROG) libxl-save-helper $(DESTDIR)$(LIBEXEC_BIN)
+       $(INSTALL_PROG) colo-tc $(DESTDIR)$(LIBEXEC_BIN)
        $(INSTALL_SHLIB) libxenlight.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)
        $(SYMLINK_SHLIB) libxenlight.so.$(MAJOR).$(MINOR) 
$(DESTDIR)$(LIBDIR)/libxenlight.so.$(MAJOR)
        $(SYMLINK_SHLIB) libxenlight.so.$(MAJOR) 
$(DESTDIR)$(LIBDIR)/libxenlight.so
diff --git a/tools/libxl/colo-tc.c b/tools/libxl/colo-tc.c
new file mode 100644
index 0000000..76093db
--- /dev/null
+++ b/tools/libxl/colo-tc.c
@@ -0,0 +1,589 @@
+/*
+ * Copyright (C) 2014 FUJITSU LIMITED
+ * Author: Wen Congyang <wency@xxxxxxxxxxxxxx>
+ *
+ * Almost all codes are copied from iproute.
+ *
+ * colo-agent introduces a new qdisc colo, and needs some parameter.
+ * tc only supports new qdisc without parameter, so we introduce
+ * a new simple command to support this new qdisc.
+ *
+ * The licenses of iproute is GPLv2 or later.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/socket.h>
+#include <linux/socket.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include <net/if.h>
+#include <errno.h>
+#include <unistd.h>
+#include <time.h>
+
+#define TCA_BUF_MAX (64*1024)
+#define NEXT_ARG()                                          \
+    do {                                                    \
+        argv++;                                             \
+        if (--argc <= 0) {                                  \
+            fprintf(stderr, "Command line is not complete." \
+                    " Try option \"help\"\n");              \
+            return -1;                                      \
+        }                                                   \
+    } while(0)
+
+enum {
+    TCA_COLO_UNSPEC,
+    TCA_COLO_DEV_IDX,
+    TCA_COLO_FLAGS,
+    TCA_COLO_VM_IDX,
+    __TCA_COLO_MAX,
+};
+
+struct colo_idx {
+    uint32_t this_idx;
+    uint32_t other_idx;
+};
+
+/* flags */
+#define IS_PRIMARY   (1 << 0)
+
+
+struct rtnl_handle
+{
+    int         fd;
+    struct sockaddr_nl  local;
+    struct sockaddr_nl  peer;
+    __u32           seq;
+};
+
+#define NLMSG_TAIL(nmsg) \
+    ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
+
+static int rtnl_open(struct rtnl_handle *rth, unsigned subscriptions)
+{
+    socklen_t addr_len;
+    int sndbuf = 32768;
+    int rcvbuf = 1024 * 1024;
+
+    memset(rth, 0, sizeof(*rth));
+
+    rth->fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+    if (rth->fd < 0) {
+        perror("Cannot open netlink socket");
+        return -1;
+    }
+
+    if (setsockopt(rth->fd, SOL_SOCKET, SO_SNDBUF, &sndbuf,
+                   sizeof(sndbuf)) < 0) {
+        perror("SO_SNDBUF");
+        return -1;
+    }
+
+    if (setsockopt(rth->fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf,
+                   sizeof(rcvbuf)) < 0) {
+        perror("SO_RCVBUF");
+        return -1;
+    }
+
+    memset(&rth->local, 0, sizeof(rth->local));
+    rth->local.nl_family = AF_NETLINK;
+    rth->local.nl_groups = subscriptions;
+    if (bind(rth->fd, (struct sockaddr*)&rth->local, sizeof(rth->local)) < 0) {
+        perror("Cannot bind netlink socket");
+        return -1;
+    }
+
+    addr_len = sizeof(rth->local);
+    if (getsockname(rth->fd, (struct sockaddr*)&rth->local, &addr_len) < 0) {
+        perror("Cannot getsockname");
+        return -1;
+    }
+    if (addr_len != sizeof(rth->local)) {
+        fprintf(stderr, "Wrong address length %d\n", addr_len);
+        return -1;
+    }
+    if (rth->local.nl_family != AF_NETLINK) {
+        fprintf(stderr, "Wrong address family %d\n", rth->local.nl_family);
+        return -1;
+    }
+
+    rth->seq = time(NULL);
+    return 0;
+}
+
+static int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, pid_t peer,
+              unsigned groups, struct nlmsghdr *answer)
+{
+    int status;
+    unsigned seq;
+    struct nlmsghdr *h;
+    struct sockaddr_nl nladdr;
+    struct iovec iov = {
+        .iov_base = (void*) n,
+        .iov_len = n->nlmsg_len
+    };
+    struct msghdr msg = {
+        .msg_name = &nladdr,
+        .msg_namelen = sizeof(nladdr),
+        .msg_iov = &iov,
+        .msg_iovlen = 1,
+    };
+    char   buf[16384];
+
+    memset(&nladdr, 0, sizeof(nladdr));
+    nladdr.nl_family = AF_NETLINK;
+    nladdr.nl_pid = peer;
+    nladdr.nl_groups = groups;
+
+    n->nlmsg_seq = seq = ++rtnl->seq;
+
+    if (answer == NULL)
+        n->nlmsg_flags |= NLM_F_ACK;
+
+    status = sendmsg(rtnl->fd, &msg, 0);
+
+    if (status < 0) {
+        perror("Cannot talk to rtnetlink");
+        return -1;
+    }
+
+    memset(buf,0,sizeof(buf));
+
+    iov.iov_base = buf;
+
+    while (1) {
+        iov.iov_len = sizeof(buf);
+        status = recvmsg(rtnl->fd, &msg, 0);
+
+        if (status < 0) {
+            if (errno == EINTR || errno == EAGAIN)
+                continue;
+            fprintf(stderr, "netlink receive error %s (%d)\n",
+                strerror(errno), errno);
+            return -1;
+        }
+        if (status == 0) {
+            fprintf(stderr, "EOF on netlink\n");
+            return -1;
+        }
+        if (msg.msg_namelen != sizeof(nladdr)) {
+            fprintf(stderr, "sender address length == %d\n", msg.msg_namelen);
+            exit(1);
+        }
+        for (h = (struct nlmsghdr*)buf; status >= sizeof(*h); ) {
+            int len = h->nlmsg_len;
+            int l = len - sizeof(*h);
+
+            if (l < 0 || len>status) {
+                if (msg.msg_flags & MSG_TRUNC) {
+                    fprintf(stderr, "Truncated message\n");
+                    return -1;
+                }
+                fprintf(stderr, "!!!malformed message: len=%d\n", len);
+                exit(1);
+            }
+
+            if (nladdr.nl_pid != peer ||
+                h->nlmsg_pid != rtnl->local.nl_pid ||
+                h->nlmsg_seq != seq) {
+                /* Don't forget to skip that message. */
+                status -= NLMSG_ALIGN(len);
+                h = (struct nlmsghdr*)((char*)h + NLMSG_ALIGN(len));
+                continue;
+            }
+
+            if (h->nlmsg_type == NLMSG_ERROR) {
+                struct nlmsgerr *err = (struct nlmsgerr*)NLMSG_DATA(h);
+                if (l < sizeof(struct nlmsgerr)) {
+                    fprintf(stderr, "ERROR truncated\n");
+                } else {
+                    if (!err->error) {
+                        if (answer)
+                            memcpy(answer, h, h->nlmsg_len);
+                        return 0;
+                    }
+
+                    fprintf(stderr, "RTNETLINK answers: %s\n", 
strerror(-err->error));
+                    errno = -err->error;
+                }
+                return -1;
+            }
+            if (answer) {
+                memcpy(answer, h, h->nlmsg_len);
+                return 0;
+            }
+
+            fprintf(stderr, "Unexpected reply!!!\n");
+
+            status -= NLMSG_ALIGN(len);
+            h = (struct nlmsghdr*)((char*)h + NLMSG_ALIGN(len));
+        }
+        if (msg.msg_flags & MSG_TRUNC) {
+            fprintf(stderr, "Message truncated\n");
+            continue;
+        }
+        if (status) {
+            fprintf(stderr, "!!!Remnant of size %d\n", status);
+            exit(1);
+        }
+    }
+}
+
+static void rtnl_close(struct rtnl_handle *rth)
+{
+    if (rth->fd >= 0) {
+        close(rth->fd);
+        rth->fd = -1;
+    }
+}
+
+static int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void 
*data,
+              int alen)
+{
+    int len = RTA_LENGTH(alen);
+    struct rtattr *rta;
+
+    if (NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len) > maxlen) {
+        fprintf(stderr, "addattr_l ERROR: message exceeded bound of %d\n",
+                maxlen);
+        return -1;
+    }
+    rta = NLMSG_TAIL(n);
+    rta->rta_type = type;
+    rta->rta_len = len;
+    memcpy(RTA_DATA(rta), data, alen);
+    n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len);
+    return 0;
+}
+
+static void duparg(const char *key, const char *arg)
+{
+    fprintf(stderr, "Error: duplicate \"%s\": \"%s\" is the second value.\n",
+            key, arg);
+    exit(1);
+}
+
+static void invarg(const char *msg, const char *arg)
+{
+    fprintf(stderr, "Error: argument \"%s\" is wrong: %s\n", arg, msg);
+    exit(1);
+}
+
+static int usage(void)
+{
+    fprintf(stderr, "Usage: tc qdisc [ add | del | replace | change ] dev 
STRING\n");
+    fprintf(stderr, "       [ handle QHANDLE ] [ root | parent CLASSID ]\n");
+    fprintf(stderr, "       QDISC_KIND [ dev STRING ]\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "Where:\n");
+    fprintf(stderr, "QDISC_KIND := { primary | secondary. }\n");
+    return -1;
+}
+
+struct rtnl_handle rth;
+
+static int get_qdisc_handle(__u32 *h, const char *str)
+{
+    __u32 maj;
+    char *p;
+
+    maj = TC_H_UNSPEC;
+    if (strcmp(str, "none") == 0)
+        goto ok;
+    maj = strtoul(str, &p, 16);
+    if (p == str)
+        return -1;
+    maj <<= 16;
+    if (*p != ':' && *p!=0)
+        return -1;
+ok:
+    *h = maj;
+    return 0;
+}
+
+static int get_tc_classid(__u32 *h, const char *str)
+{
+    __u32 maj, min;
+    char *p;
+
+    maj = TC_H_ROOT;
+    if (strcmp(str, "root") == 0)
+        goto ok;
+    maj = TC_H_UNSPEC;
+    if (strcmp(str, "none") == 0)
+        goto ok;
+    maj = strtoul(str, &p, 16);
+    if (p == str) {
+        maj = 0;
+        if (*p != ':')
+            return -1;
+    }
+    if (*p == ':') {
+        if (maj >= (1<<16))
+            return -1;
+        maj <<= 16;
+        str = p+1;
+        min = strtoul(str, &p, 16);
+        if (*p != 0)
+            return -1;
+        if (min >= (1<<16))
+            return -1;
+        maj |= min;
+    } else if (*p != 0)
+        return -1;
+
+ok:
+    *h = maj;
+    return 0;
+}
+
+static uint32_t get_idx(const char *name)
+{
+    uint32_t idx;
+
+    idx = if_nametoindex(name);
+    if (!idx)
+        fprintf(stderr, "Cannot find device \"%s\"\n", name);
+
+    return idx;
+}
+
+/*
+ * Parse the colo qdisc options ("dev NAME", "primary" | "secondary",
+ * "vmid N") and append them to @n as a nested TCA_OPTIONS attribute.
+ * Returns 0 on success or when @cmd is not RTM_NEWQDISC, -1 on error.
+ */
+static int parse_opt(int argc, char **argv, struct nlmsghdr *n, int cmd, int this_idx)
+{
+    struct colo_idx idx;
+    struct rtattr *tail;
+    int is_primary, is_secondary;
+    uint32_t flags = 0;
+    uint32_t vmidx = 0;
+    char *p;
+
+    if (cmd != RTM_NEWQDISC)
+        return 0;
+
+    is_primary = 0;
+    is_secondary = 0;
+    memset(&idx, 0, sizeof(idx));
+
+    while (argc > 0) {
+        if (strcmp(*argv, "dev") ==0) {
+            NEXT_ARG();
+            if (idx.other_idx)
+                duparg("dev", *argv);
+            idx.other_idx = get_idx(*argv);
+            if (!idx.other_idx)
+                return -1;
+            idx.this_idx = this_idx;
+            if (idx.this_idx == idx.other_idx) {
+                fprintf(stderr, "Cannot use the same device\n");
+                return -1;
+            }
+        } else if (strcmp(*argv, "primary") == 0) {
+            if (is_secondary) {
+                fprintf(stderr, "\"primary\" conflicts with \"secondary\"\n");
+                return -1;
+            }
+            is_primary = 1;
+        } else if (strcmp(*argv, "secondary") == 0) {
+            if (is_primary) {
+                fprintf(stderr, "\"secondary\" conflicts with \"primary\"\n");
+                return -1;
+            }
+            is_secondary = 1;
+        } else if (strcmp(*argv, "vmid") == 0) {
+            NEXT_ARG();
+            if (vmidx)
+                duparg("vmid", *argv);
+            vmidx = strtoul(*argv, &p, 10);
+            if (*p != '\0' || !vmidx) {
+                fprintf(stderr, "invalid vmid value %s\n", *argv);
+                return -1;
+            }
+        } else {
+            fprintf(stderr, "unsupported option \"%s\"\n", *argv);
+            return -1;
+        }
+        argc--;
+        argv++;
+    }
+
+    if (!idx.other_idx) {
+        fprintf(stderr, "missing option dev\n");
+        return -1;
+    }
+
+    if (!is_primary && !is_secondary) {
+        fprintf(stderr, "missing option primary or secondary\n");
+        return -1;
+    }
+
+    if (!vmidx) {
+        fprintf(stderr, "missing option vmid\n");
+        return -1;
+    }
+
+    if (is_primary)
+        flags |= IS_PRIMARY;
+    /* Nest the colo attributes inside TCA_OPTIONS; fix up its length last. */
+    tail = NLMSG_TAIL(n);
+    addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+    addattr_l(n, 1024, TCA_COLO_DEV_IDX, &idx, sizeof(idx));
+    addattr_l(n, 1024, TCA_COLO_FLAGS, &flags, sizeof(flags));
+    addattr_l(n, 1024, TCA_COLO_VM_IDX, &vmidx, sizeof(vmidx));
+    tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
+    return 0;
+}
+
+static int tc_qdisc_modify(int cmd, unsigned flags, int argc, char **argv)
+{
+    struct {
+        struct nlmsghdr n;
+        struct tcmsg t;
+        char buff[TCA_BUF_MAX];
+    } req;
+    char k[16];
+    uint32_t handle = 0, idx = 0;
+
+    memset(&req, 0, sizeof(req));
+    memset(k, 0, sizeof(k));
+
+    req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+    req.n.nlmsg_flags = NLM_F_REQUEST|flags;
+    req.n.nlmsg_type = cmd;
+    req.t.tcm_family = AF_UNSPEC;
+
+    while (argc > 0) {
+        if (strcmp(*argv, "dev") == 0) {
+            NEXT_ARG();
+            if (req.t.tcm_ifindex)
+                duparg("dev", *argv);
+
+            idx = get_idx(*argv);
+            if (!idx)
+                return -1;
+            req.t.tcm_ifindex = idx;
+        } else if (strcmp(*argv, "handle") == 0) {
+            NEXT_ARG();
+            if (req.t.tcm_handle)
+                duparg("handle", *argv);
+            if (get_qdisc_handle(&handle, *argv))
+                invarg(*argv, "invalid qdisc ID");
+            req.t.tcm_handle = handle;
+        } else if (strcmp(*argv, "root") == 0) {
+            if (req.t.tcm_parent) {
+                fprintf(stderr, "Error: \"root\" is duplicate parent ID\n");
+                return -1;
+            }
+            req.t.tcm_parent = TC_H_ROOT;
+        } else if (strcmp(*argv, "parent") == 0) {
+            NEXT_ARG();
+            if (req.t.tcm_parent)
+                duparg("parent", *argv);
+            if (get_tc_classid(&handle, *argv))
+                invarg(*argv, "invalid parent ID");
+            req.t.tcm_parent = handle;
+        } else if (strcmp(*argv, "colo") == 0) {
+            strncpy(k, *argv, sizeof(k) - 1);
+            argc--;
+            argv++;
+            break;
+        } else if (strcmp(*argv, "help") == 0){
+            usage();
+            return 0;
+        } else {
+            fprintf(stderr, "unsupported qdisc %s\n", *argv);
+            return -1;
+        }
+        argc--;
+        argv++;
+    }
+
+    if (!k[0]) {
+        fprintf(stderr, "no qdisc is specified\n");
+        return -1;
+    }
+
+    addattr_l(&req.n, sizeof(req), TCA_KIND, k, strlen(k)+1);
+    if (parse_opt(argc, argv, &req.n, cmd, idx))
+        return -1;
+
+    if (rtnl_talk(&rth, &req.n, 0, 0, NULL) < 0)
+        return -1;
+
+    return 0;
+}
+
+static int matches(const char *cmd, const char *pattern)
+{
+    int len = strlen(cmd);
+    if (len > strlen(pattern))
+        return -1;
+    return memcmp(pattern, cmd, len);
+}
+
+static int do_qdisc(int argc, char *argv[])
+{
+    if (matches(*argv, "add") == 0)
+        return tc_qdisc_modify(RTM_NEWQDISC, NLM_F_EXCL|NLM_F_CREATE, argc-1, 
argv+1);
+    if (matches(*argv, "change") == 0)
+        return tc_qdisc_modify(RTM_NEWQDISC, 0, argc-1, argv+1);
+    if (matches(*argv, "replace") == 0)
+        return tc_qdisc_modify(RTM_NEWQDISC, NLM_F_CREATE|NLM_F_REPLACE, 
argc-1, argv+1);
+    if (matches(*argv, "link") == 0)
+        return tc_qdisc_modify(RTM_NEWQDISC, NLM_F_REPLACE, argc-1, argv+1);
+    if (matches(*argv, "delete") == 0)
+        return tc_qdisc_modify(RTM_DELQDISC, 0,  argc-1, argv+1);
+
+    fprintf(stderr, "Command \"%s\" is unknown, try \"tc qdisc help\".\n", 
*argv);
+    return -1;
+}
+
+int main(int argc, char *argv[])
+{
+    int ret;
+    /* Usage: colo-tc qdisc COMMAND ...; exits 0 on success, 1 on failure. */
+    if (rtnl_open(&rth, 0) < 0) {
+        fprintf(stderr, "Cannot open rtnetlink\n");
+        exit(1);
+    }
+    /* argv[1] is NULL when no arguments are given; check argc first. */
+    if (argc < 2 || matches(argv[1], "qdisc")) {
+        usage();
+        exit(1);
+    }
+
+    argc -= 2;
+    argv += 2;
+
+    if (argc < 1) {
+        usage();
+        exit(1);
+    }
+
+    ret = do_qdisc(argc, argv);
+
+    rtnl_close(&rth);
+
+    if (ret)
+        return 1;
+
+    return 0;
+}
diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 822f1f3..f8c0bab 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -3371,6 +3371,11 @@ void libxl__device_nic_add(libxl__egc *egc, uint32_t 
domid,
         flexarray_append(back, nic->ifname);
     }
 
+    if (nic->forwarddev) {
+        flexarray_append(back, "forwarddev");
+        flexarray_append(back, nic->forwarddev);
+    }
+
     flexarray_append(back, "mac");
     flexarray_append(back,libxl__sprintf(gc,
                                     LIBXL_MAC_FMT, LIBXL_MAC_BYTES(nic->mac)));
@@ -3494,6 +3499,7 @@ static int libxl__device_nic_from_xs_be(libxl__gc *gc,
     nic->ip = READ_BACKEND(NOGC, "ip");
     nic->bridge = READ_BACKEND(NOGC, "bridge");
     nic->script = READ_BACKEND(NOGC, "script");
+    nic->forwarddev = READ_BACKEND(NOGC, "forwarddev");
 
     /* vif_ioemu nics use the same xenstore entries as vif interfaces */
     tmp = READ_BACKEND(gc, "type");
diff --git a/tools/libxl/libxl_colo_nic.c b/tools/libxl/libxl_colo_nic.c
new file mode 100644
index 0000000..0578973
--- /dev/null
+++ b/tools/libxl/libxl_colo_nic.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (C) 2014 FUJITSU LIMITED
+ * Author: Wen Congyang <wency@xxxxxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#include "libxl_internal.h"
+
+/* Per-vif state stored in libxl__checkpoint_device.concrete_data. */
+typedef struct libxl__colo_device_nic {
+    int devid;          /* copied from the backing libxl_device_nic */
+    const char *vif;    /* interface name as returned by get_vifname() */
+} libxl__colo_device_nic;
+
+/*
+ * Which side of the COLO pair the script is run for; setup_async_exec()
+ * turns this into the "mode" environment value ("primary"/"secondary").
+ */
+enum {
+    primary,
+    secondary,
+};
+
+
+/* ========== init() and cleanup() ========== */
+/* Subkind initialisation hook for COLO nics; currently nothing to do. */
+int init_subkind_colo_nic(libxl__checkpoint_devices_state *cds)
+{
+    return 0;
+}
+
+/* Counterpart of init_subkind_colo_nic(); currently nothing to release. */
+void cleanup_subkind_colo_nic(libxl__checkpoint_devices_state *cds)
+{
+}
+
+/* ========== helper functions ========== */
+static void colo_save_setup_script_cb(libxl__egc *egc,
+                                     libxl__async_exec_state *aes,
+                                     int status);
+static void colo_save_teardown_script_cb(libxl__egc *egc,
+                                         libxl__async_exec_state *aes,
+                                         int status);
+
+/*
+ * Work out the backend interface name for a nic.
+ *
+ * A "vifname" entry under the dom0 backend directory in xenstore takes
+ * precedence; when the read succeeds but no such entry exists, the name
+ * produced by libxl__device_nic_devname() is used instead.  Returns NULL
+ * when the xenstore read itself fails.
+ *
+ * Note carried over from the original author: this must ONLY be used for
+ * checkpointing (Remus/COLO), because if driver domains were in use it
+ * would constitute a security vulnerability.
+ */
+static const char *get_vifname(libxl__checkpoint_device *dev,
+                               const libxl_device_nic *nic)
+{
+    const char *name = NULL;
+    const char *xs_path;
+
+    STATE_AO_GC(dev->cds->ao);
+
+    /* Convenience aliases */
+    const uint32_t domid = dev->cds->domid;
+
+    xs_path = GCSPRINTF("%s/backend/vif/%d/%d/vifname",
+                        libxl__xs_get_dompath(gc, 0), domid, nic->devid);
+
+    if (libxl__xs_read_checked(gc, XBT_NULL, xs_path, &name))
+        return name;
+
+    if (!name)
+        name = libxl__device_nic_devname(gc, domid, nic->devid,
+                                         nic->nictype);
+
+    return name;
+}
+
+/*
+ * Fill in dev->aodev.aes so that the colo-agent script can be run for
+ * this vif; the caller starts it with libxl__async_exec_start().
+ *
+ * the script needs the following env & args
+ * $vifname
+ * $XENBUS_PATH (/libxl/<domid>/colo_agent/<devid>/)
+ * $forwarddev
+ * $mode(primary/secondary)
+ * $vmid
+ * setup/teardown as command line arg.
+ *
+ * Requires dev->concrete_data (with a non-NULL vif) and the nic's
+ * forwarddev to be set: both are dereferenced/duplicated unconditionally.
+ */
+static void setup_async_exec(libxl__checkpoint_device *dev, char *op, int side,
+                             char *colo_agent_script)
+{
+    int arraysize, nr = 0;
+    char **env = NULL, **args = NULL;
+    libxl__colo_device_nic *colo_nic = dev->concrete_data;
+    libxl__checkpoint_devices_state *cds = dev->cds;
+    libxl__async_exec_state *aes = &dev->aodev.aes;
+    const libxl_device_nic *nic = dev->backend_dev;
+
+    STATE_AO_GC(cds->ao);
+
+    /* Convenience aliases */
+    const uint32_t domid = cds->domid;
+    const int devid = colo_nic->devid;
+    const char *const vif = colo_nic->vif;
+
+    /* env is a flat list of name, value pairs: 5 pairs plus terminator. */
+    arraysize = 11;
+    GCNEW_ARRAY(env, arraysize);
+    env[nr++] = "vifname";
+    env[nr++] = libxl__strdup(gc, vif);
+    env[nr++] = "XENBUS_PATH";
+    env[nr++] = GCSPRINTF("%s/colo_agent/%d",
+                          libxl__xs_libxl_path(gc, domid), devid);
+    env[nr++] = "forwarddev";
+    env[nr++] = libxl__strdup(gc, nic->forwarddev);
+    env[nr++] = "mode";
+    if (side == primary)
+        env[nr++] = "primary";
+    else
+        env[nr++] = "secondary";
+    env[nr++] = "vmid";
+    env[nr++] = GCSPRINTF("%u", domid);
+    env[nr++] = NULL;
+    assert(nr == arraysize);
+
+    /* argv for the script: <script> <op>, NULL terminated. */
+    arraysize = 3; nr = 0;
+    GCNEW_ARRAY(args, arraysize);
+    args[nr++] = colo_agent_script;
+    args[nr++] = op;
+    args[nr++] = NULL;
+    assert(nr == arraysize);
+
+    aes->ao = dev->cds->ao;
+    aes->what = GCSPRINTF("%s %s", args[0], args[1]);
+    aes->env = env;
+    aes->args = args;
+    aes->timeout_ms = LIBXL_HOTPLUG_TIMEOUT * 1000;
+    /* presumably -1 means "no redirection" for stdin/stdout/stderr --
+     * confirm against libxl__async_exec_start() */
+    aes->stdfds[0] = -1;
+    aes->stdfds[1] = -1;
+    aes->stdfds[2] = -1;
+
+    /* Any op other than "teardown" is completed by the setup callback. */
+    if (!strcmp(op, "teardown"))
+        aes->callback = colo_save_teardown_script_cb;
+    else
+        aes->callback = colo_save_setup_script_cb;
+}
+
+/* ========== setup() and teardown() ========== */
+/*
+ * Start the colo-agent script with the "setup" operation for one vif.
+ *
+ * On success the script runs asynchronously and completion is reported
+ * from colo_save_setup_script_cb().  If the nic has no forwarddev, its
+ * interface name cannot be determined, or the script cannot be started,
+ * the device callback is invoked immediately with the error.
+ */
+static void colo_nic_setup(libxl__egc *egc, libxl__checkpoint_device *dev,
+                           int side, char *colo_agent_script)
+{
+    int rc = ERROR_FAIL;
+    libxl__colo_device_nic *colo_nic;
+    const libxl_device_nic *nic = dev->backend_dev;
+
+    STATE_AO_GC(dev->cds->ao);
+
+    /*
+     * There is no subkind of nic devices, so the nic ops always match
+     * nic devices; mark the device as matched before setting it up.
+     */
+    dev->matched = 1;
+
+    if (!nic->forwarddev)
+        goto fail;
+
+    GCNEW(colo_nic);
+    dev->concrete_data = colo_nic;
+    colo_nic->devid = nic->devid;
+    colo_nic->vif = get_vifname(dev, nic);
+    if (!colo_nic->vif)
+        goto fail;
+
+    setup_async_exec(dev, "setup", side, colo_agent_script);
+    rc = libxl__async_exec_start(gc, &dev->aodev.aes);
+    if (!rc)
+        return;
+
+fail:
+    dev->aodev.rc = rc;
+    dev->aodev.callback(egc, &dev->aodev);
+}
+
+/*
+ * Completion callback for the "setup" run of the colo-agent script.
+ *
+ * The script reports failures through the hotplug-error node below
+ * <libxl path>/colo_agent/<devid>; that node is checked first, then the
+ * exit status.  The outcome is stored in aodev->rc and the device
+ * callback is invoked.
+ */
+static void colo_save_setup_script_cb(libxl__egc *egc,
+                                      libxl__async_exec_state *aes,
+                                      int status)
+{
+    libxl__ao_device *aodev = CONTAINER_OF(aes, *aodev, aes);
+    libxl__checkpoint_device *dev = CONTAINER_OF(aodev, *dev, aodev);
+    libxl__colo_device_nic *colo_nic = dev->concrete_data;
+    libxl__checkpoint_devices_state *cds = dev->cds;
+    const char *error_path, *script_error = NULL;
+    int rc;
+
+    STATE_AO_GC(cds->ao);
+
+    error_path = GCSPRINTF("%s/colo_agent/%d/hotplug-error",
+                           libxl__xs_libxl_path(gc, cds->domid),
+                           colo_nic->devid);
+
+    rc = libxl__xs_read_checked(gc, XBT_NULL, error_path, &script_error);
+    if (!rc) {
+        if (script_error) {
+            LOG(ERROR, "colo_agent script %s setup failed for vif %s: %s",
+                aes->args[0], colo_nic->vif, script_error);
+            rc = ERROR_FAIL;
+        } else if (status) {
+            rc = ERROR_FAIL;
+        }
+    }
+
+    aodev->rc = rc;
+    aodev->callback(egc, aodev);
+}
+
+/*
+ * Start the colo-agent script with the "teardown" operation for one vif.
+ *
+ * On success the script runs asynchronously and completion is reported
+ * from colo_save_teardown_script_cb(); if the script cannot be started
+ * the device callback is invoked immediately with the error.
+ *
+ * colo_nic_setup() marks the device as matched before it can fail, so it
+ * may leave concrete_data (or its vif name) unset.  setup_async_exec()
+ * dereferences both, so such a device is reported as torn down
+ * successfully without running the script: nothing was set up for it.
+ */
+static void colo_nic_teardown(libxl__egc *egc, libxl__checkpoint_device *dev,
+                              int side, char *colo_agent_script)
+{
+    int rc;
+    libxl__colo_device_nic *colo_nic = dev->concrete_data;
+
+    STATE_AO_GC(dev->cds->ao);
+
+    if (!colo_nic || !colo_nic->vif) {
+        /* colo nic has not been set up, nothing to undo */
+        rc = 0;
+        goto out;
+    }
+
+    setup_async_exec(dev, "teardown", side, colo_agent_script);
+
+    rc = libxl__async_exec_start(gc, &dev->aodev.aes);
+    if (rc)
+        goto out;
+
+    return;
+
+out:
+    dev->aodev.rc = rc;
+    dev->aodev.callback(egc, &dev->aodev);
+}
+
+/*
+ * Completion callback for the "teardown" run of the colo-agent script:
+ * a non-zero exit status is reported as ERROR_FAIL, anything else as
+ * success, and the device callback is invoked.
+ */
+static void colo_save_teardown_script_cb(libxl__egc *egc,
+                                         libxl__async_exec_state *aes,
+                                         int status)
+{
+    libxl__ao_device *aodev = CONTAINER_OF(aes, *aodev, aes);
+
+    aodev->rc = status ? ERROR_FAIL : 0;
+    aodev->callback(egc, aodev);
+}
+
+/* ======== primary ======== */
+/*
+ * .setup hook for the primary side: looks up the enclosing colo save
+ * state to get the script path and runs colo_nic_setup() in primary mode.
+ */
+static void colo_nic_save_setup(libxl__egc *egc, libxl__checkpoint_device *dev)
+{
+    libxl__colo_save_state *css = CONTAINER_OF(dev->cds, *css, cds);
+
+    colo_nic_setup(egc, dev, primary, css->colo_agent_script);
+}
+
+/*
+ * .teardown hook for the primary side: looks up the enclosing colo save
+ * state to get the script path and runs colo_nic_teardown() in primary
+ * mode.
+ */
+static void colo_nic_save_teardown(libxl__egc *egc,
+                                   libxl__checkpoint_device *dev)
+{
+    libxl__colo_save_state *css = CONTAINER_OF(dev->cds, *css, cds);
+
+    colo_nic_teardown(egc, dev, primary, css->colo_agent_script);
+}
+
+/* Checkpoint device ops for vif devices on the COLO primary (save) side. */
+const libxl__checkpoint_device_instance_ops colo_save_device_nic = {
+    .kind = LIBXL__DEVICE_KIND_VIF,
+    .setup = colo_nic_save_setup,
+    .teardown = colo_nic_save_teardown,
+};
diff --git a/tools/libxl/libxl_colo_save.c b/tools/libxl/libxl_colo_save.c
index 516b913..b9680a3 100644
--- a/tools/libxl/libxl_colo_save.c
+++ b/tools/libxl/libxl_colo_save.c
@@ -19,12 +19,162 @@
 #include "libxl_colo.h"
 
 extern const libxl__checkpoint_device_instance_ops 
colo_save_device_blktap2_disk;
+extern const libxl__checkpoint_device_instance_ops colo_save_device_nic;
 
 static const libxl__checkpoint_device_instance_ops *colo_ops[] = {
     &colo_save_device_blktap2_disk,
+    &colo_save_device_nic,
     NULL,
 };
 
+/* ================= colo-agent: setup, wait and teardown ================= */
+static void colo_start_new_checkpoint(libxl__egc *egc,
+                                      libxl__checkpoint_devices_state *cds,
+                                      int rc);
+static void colo_agent_async_wait_for_checkpoint(libxl__colo_save_state *css);
+static void colo_agent_async_call_done(libxl__egc *egc,
+                                       libxl__ev_child *child,
+                                       int pid,
+                                       int status);
+
+/*
+ * ioctl requests for the colo-agent device (/dev/HA_compare).
+ * The COMP_IOCT* requests are issued on the per-VM descriptor
+ * (css->vm_fd) in this file.
+ */
+#define COMP_IOC_MAGIC          'k'
+#define COMP_IOCTWAIT           _IO(COMP_IOC_MAGIC, 0)
+#define COMP_IOCTFLUSH          _IO(COMP_IOC_MAGIC, 1)
+#define COMP_IOCTRESUME         _IO(COMP_IOC_MAGIC, 2)
+
+/* These requests are issued on the control descriptor (css->fd). */
+#define COLO_IO                 0x33
+#define COLO_CREATE_VM          _IO(COLO_IO, 0x00)
+#define COLO_RELEASE_VM         _IO(COLO_IO, 0x01)
+
+/* Argument passed with COMP_IOCTWAIT; presumably milliseconds -- confirm
+ * against the colo-agent kernel module. */
+#define COMP_IOCTWAIT_TIMEOUT   5000
+
+/*
+ * Open the colo-agent control device and register the domain with it.
+ *
+ * On success css->fd holds the control descriptor and css->vm_fd the
+ * value returned by the COLO_CREATE_VM ioctl, and 0 is returned.  On
+ * failure ERROR_FAIL is returned; if the ioctl failed, the control
+ * descriptor is closed again and css->fd reset to -1.
+ */
+static int colo_agent_setup(libxl__colo_save_state *css, int domid)
+{
+    int vm_fd;
+
+    STATE_AO_GC(css->cds.ao);
+
+    css->fd = open("/dev/HA_compare", O_RDWR);
+    if (css->fd < 0) {
+        LOG(ERROR, "cannot open /dev/HA_compare");
+        return ERROR_FAIL;
+    }
+
+    vm_fd = ioctl(css->fd, COLO_CREATE_VM, domid);
+    if (vm_fd >= 0) {
+        css->vm_fd = vm_fd;
+        return 0;
+    }
+
+    LOG(ERROR, "cannot pass vmid to colo-agent");
+    close(css->fd);
+    css->fd = -1;
+    return ERROR_FAIL;
+}
+
+/*
+ * Issue COMP_IOCTFLUSH on the per-VM colo-agent descriptor.
+ * The ioctl result is not checked.
+ */
+static void colo_agent_preresume(libxl__colo_save_state *css)
+{
+    ioctl(css->vm_fd, COMP_IOCTFLUSH);
+}
+
+/*
+ * Issue COMP_IOCTRESUME on the per-VM colo-agent descriptor.
+ * The ioctl result is not checked.
+ */
+static void colo_agent_postresume(libxl__colo_save_state *css)
+{
+    ioctl(css->vm_fd, COMP_IOCTRESUME);
+}
+
+/*
+ * Run func(css) in a forked child and have callback invoked when the
+ * child is reaped.
+ *
+ * func is expected to terminate the child itself; if it ever returns
+ * the child aborts.  When the fork fails, callback is invoked directly
+ * with pid -1 and a non-zero status.
+ */
+static void colo_agent_async_call(libxl__egc *egc,
+                                  libxl__colo_save_state *css,
+                                  void func(libxl__colo_save_state *),
+                                  libxl__ev_child_callback callback)
+{
+    int pid;
+
+    STATE_AO_GC(css->cds.ao);
+
+    /* Fork and call */
+    pid = libxl__ev_child_fork(gc, &css->child, callback);
+
+    if (pid == -1) {
+        LOG(ERROR, "unable to fork");
+        callback(egc, &css->child, -1, 1);
+        return;
+    }
+
+    if (pid == 0) {
+        /* child */
+        func(css);
+        /* notreached */
+        abort();
+    }
+
+    /* parent: completion is delivered through callback */
+}
+
+/*
+ * Wait asynchronously for colo-agent: the wait runs in a forked child
+ * (colo_agent_async_wait_for_checkpoint) and the result is handled in
+ * colo_agent_async_call_done().
+ */
+static void colo_agent_wait_for_checkpoint(libxl__egc *egc,
+                                           libxl__colo_save_state *css)
+{
+    colo_agent_async_call(egc, css,
+                          colo_agent_async_wait_for_checkpoint,
+                          colo_agent_async_call_done);
+}
+
+/*
+ * Child-side body of colo_agent_wait_for_checkpoint(); never returns.
+ *
+ * Blocks in the COMP_IOCTWAIT ioctl and exits with status 0 when the
+ * ioctl succeeds or fails with ETIME, and with status 1 on any other
+ * error.  An interrupted wait is restarted: user space normally sees
+ * EINTR for that, so it is retried in addition to ERESTART rather than
+ * being reported as a failure.
+ */
+static void colo_agent_async_wait_for_checkpoint(libxl__colo_save_state *css)
+{
+    for (;;) {
+        if (ioctl(css->vm_fd, COMP_IOCTWAIT, COMP_IOCTWAIT_TIMEOUT) >= 0)
+            _exit(0);
+
+        /* interrupted, wait again */
+        if (errno == EINTR || errno == ERESTART)
+            continue;
+
+        /* ETIME is reported as a normal completion of the wait */
+        _exit(errno == ETIME ? 0 : 1);
+    }
+}
+
+/*
+ * Child-exit callback for the checkpoint wait: logs a failure when the
+ * child reported a non-zero status and, in either case, continues with
+ * colo_start_new_checkpoint(), passing ERROR_FAIL or 0 accordingly.
+ */
+static void colo_agent_async_call_done(libxl__egc *egc,
+                                       libxl__ev_child *child,
+                                       int pid,
+                                       int status)
+{
+    libxl__colo_save_state *css = CONTAINER_OF(child, *css, child);
+    int rc = 0;
+
+    EGC_GC;
+
+    if (status) {
+        LOG(ERROR, "failed to wait for new checkpoint");
+        rc = ERROR_FAIL;
+    }
+
+    colo_start_new_checkpoint(egc, &css->cds, rc);
+}
+
+/*
+ * Undo colo_agent_setup(): close the per-VM descriptor and release the
+ * domain via COLO_RELEASE_VM, then close the control descriptor.
+ * Descriptors that are not open (< 0) are skipped and closed ones are
+ * reset to -1, so calling this again does nothing.  The ioctl result is
+ * not checked.
+ */
+static void colo_agent_teardown(libxl__colo_save_state *css, int domid)
+{
+    if (css->vm_fd >= 0) {
+        /* the VM descriptor is closed before the VM is released */
+        close(css->vm_fd);
+        css->vm_fd = -1;
+        ioctl(css->fd, COLO_RELEASE_VM, domid);
+    }
+
+    if (css->fd >= 0) {
+        close(css->fd);
+        css->fd = -1;
+    }
+}
+
 /* ================= helper functions ================= */
 static int init_device_subkind(libxl__checkpoint_devices_state *cds)
 {
@@ -32,6 +182,9 @@ static int 
init_device_subkind(libxl__checkpoint_devices_state *cds)
     int rc;
     STATE_AO_GC(cds->ao);
 
+    rc = init_subkind_colo_nic(cds);
+    if (rc) goto out;
+
     rc = init_subkind_drbd_disk(cds);
     if (rc) goto out;
 
@@ -46,6 +199,7 @@ static void 
cleanup_device_subkind(libxl__checkpoint_devices_state *cds)
     STATE_AO_GC(cds->ao);
 
     cleanup_subkind_blktap2_disk(cds);
+    cleanup_subkind_colo_nic(cds);
 }
 
 /* ================= colo: setup save environment ================= */
@@ -73,9 +227,12 @@ void libxl__colo_save_setup(libxl__egc *egc, 
libxl__colo_save_state *css)
     css->send_fd = dss->fd;
     css->recv_fd = dss->recv_fd;
     css->svm_running = false;
+    css->fd = -1;
+    css->vm_fd = -1;
+    libxl__ev_child_init(&css->child);
 
-    /* TODO: nic support */
-    cds->device_kind_flags = (1 << LIBXL__DEVICE_KIND_VBD);
+    cds->device_kind_flags = (1 << LIBXL__DEVICE_KIND_VBD) |
+                             (1 << LIBXL__DEVICE_KIND_VIF);
     cds->ops = colo_ops;
     cds->callback = colo_save_setup_done;
     cds->ao = ao;
@@ -101,12 +258,17 @@ static void colo_save_setup_done(libxl__egc *egc,
     STATE_AO_GC(cds->ao);
 
     if (!rc) {
+        rc = colo_agent_setup(css, dss->domid);
+        if (rc)
+            goto failed;
         libxl__domain_suspend(egc, dss);
         return;
     }
 
     LOG(ERROR, "COLO: failed to setup device for guest with domid %u",
         dss->domid);
+
+failed:
     css->cds.callback = colo_save_setup_failed;
     libxl__checkpoint_devices_teardown(egc, &css->cds);
 }
@@ -154,6 +316,7 @@ static void colo_teardown_done(libxl__egc *egc,
     libxl__domain_suspend_state *dss = CONTAINER_OF(css, *dss, css);
 
     cleanup_device_subkind(cds);
+    colo_agent_teardown(css, dss->domid);
     dss->callback(egc, dss, rc);
 }
 
@@ -420,6 +583,8 @@ static void colo_read_svm_ready_done(libxl__egc *egc,
         goto out;
     }
 
+    colo_agent_preresume(css);
+
     css->svm_running = true;
     css->cds.callback = colo_preresume_cb;
     libxl__checkpoint_devices_preresume(egc, &css->cds);
@@ -488,6 +653,8 @@ static void colo_read_svm_resumed_done(libxl__egc *egc,
         goto out;
     }
 
+    colo_agent_postresume(css);
+
     ok = 1;
 
 out:
@@ -505,9 +672,6 @@ out:
 static void colo_device_commit_cb(libxl__egc *egc,
                                   libxl__checkpoint_devices_state *cds,
                                   int rc);
-static void colo_start_new_checkpoint(libxl__egc *egc,
-                                      libxl__checkpoint_devices_state *cds,
-                                      int rc);
 static void colo_send_data_done(libxl__egc *egc,
                                 libxl__datacopier_state *dc,
                                 int onwrite, int errnoval);
@@ -539,8 +703,7 @@ static void colo_device_commit_cb(libxl__egc *egc,
         goto out;
     }
 
-    /* TODO: wait a new checkpoint */
-    colo_start_new_checkpoint(egc, cds, 0);
+    colo_agent_wait_for_checkpoint(egc, css);
     return;
 
 out:
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 1659845..cdd8d1e 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2719,6 +2719,8 @@ int 
init_subkind_drbd_disk(libxl__checkpoint_devices_state *cds);
 void cleanup_subkind_drbd_disk(libxl__checkpoint_devices_state *cds);
 int init_subkind_blktap2_disk(libxl__checkpoint_devices_state *cds);
 void cleanup_subkind_blktap2_disk(libxl__checkpoint_devices_state *cds);
+int init_subkind_colo_nic(libxl__checkpoint_devices_state *cds);
+void cleanup_subkind_colo_nic(libxl__checkpoint_devices_state *cds);
 
 typedef void libxl__checkpoint_callback(libxl__egc *,
                                         libxl__checkpoint_devices_state *,
@@ -2834,6 +2836,7 @@ struct libxl__colo_save_state {
     libxl__checkpoint_devices_state cds;
     int send_fd;
     int recv_fd;
+    char *colo_agent_script;
 
     /* private */
     libxl__datacopier_state dc;
@@ -2845,6 +2848,11 @@ struct libxl__colo_save_state {
     uint8_t temp_buff[9];
     void (*callback)(libxl__egc *, libxl__colo_save_state *);
     bool svm_running;
+
+    /* private, used by colo-agent */
+    int fd;
+    int vm_fd;
+    libxl__ev_child child;
 };
 
 /*----- Domain suspend (save) state structure -----*/
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index 54e1684..7ee3333 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -514,6 +514,7 @@ libxl_device_nic = Struct("device_nic", [
     ("rate_bytes_per_interval", uint64),
     ("rate_interval_usecs", uint32),
     ("gatewaydev", string),
+    ("forwarddev", string)
     ])
 
 libxl_device_pci = Struct("device_pci", [
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index ec77217..7a47dd2 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -1615,6 +1615,9 @@ static void parse_config_data(const char *config_source,
                 } else if (!strcmp(p, "gatewaydev")) {
                     free(nic->gatewaydev);
                     nic->gatewaydev = strdup(p2 + 1);
+                } else if (!strcmp(p, "forwarddev")) {
+                    free(nic->forwarddev);
+                    nic->forwarddev = strdup(p2 + 1);
                 }
             } while ((p = strtok(NULL, ",")) != NULL);
 skip_nic:
-- 
1.9.3


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.