|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH 3 of 3] tools/libxl: xl remus/remus-receive commands
# HG changeset patch
# User Shriram Rajagopalan <rshriram@xxxxxxxxx>
# Date 1327358642 28800
# Node ID 822536df4aeced5aee00f1f26299086faa622681
# Parent 0446591bee86eb4e767d75b70c885549c7a3cfef
tools/libxl: xl remus/remus-receive commands
* xl remus (and its receive counterpart remus-receive) act as frontends
to enable remus for a given domain.
* At the moment, only memory checkpointing and blackhole replication are
supported. Support for disk checkpointing and network buffering will
be added in the future.
* xl remus borrows some aspects of xl migrate. Replication is currently
done over an ssh connection. Future versions will use a low-overhead
plain TCP socket for replication (similar to xend/remus).
Signed-off-by: Shriram Rajagopalan <rshriram@xxxxxxxxx>
diff -r 0446591bee86 -r 822536df4aec tools/libxl/libxl.c
--- a/tools/libxl/libxl.c Mon Jan 23 14:44:00 2012 -0800
+++ b/tools/libxl/libxl.c Mon Jan 23 14:44:02 2012 -0800
@@ -466,6 +466,40 @@
return ptr;
}
+/*
+ * Start Remus checkpointing for a domain.
+ *
+ * ctx:   libxl context.
+ * info:  Remus parameters; must not be NULL.
+ * domid: domain to checkpoint.
+ * fd:    descriptor passed on to libxl__domain_suspend_common().
+ *
+ * Returns -1 when info is NULL; otherwise returns whatever
+ * libxl__domain_suspend_common() returned.
+ */
+int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info,
+                             uint32_t domid, int fd)
+{
+    GC_INIT(ctx);
+    libxl_domain_type type = libxl__domain_type(gc, domid);
+    int rc = 0;
+
+    if (info == NULL) {
+        /* domid is a uint32_t, so it is printed as unsigned. */
+        LIBXL__LOG(ctx, LIBXL__LOG_ERROR,
+                   "No remus_info structure supplied for domain %u", domid);
+        rc = -1;
+        goto remus_fail;
+    }
+
+    /* TBD: Remus setup - i.e. attach qdisc, enable disk buffering, etc */
+
+    /* Point of no return */
+    rc = libxl__domain_suspend_common(gc, domid, fd, type, /* live */ 1,
+                                      /* debug */ 0, info);
+
+    /*
+     * With Remus, if we reach this point, it means either
+     * backup died or some network error occurred preventing us
+     * from sending checkpoints.
+     */
+
+    /* TBD: Remus cleanup - i.e. detach qdisc, release other
+     * resources.
+     */
+ remus_fail:
+    GC_FREE;
+    return rc;
+}
+
+
int libxl_domain_suspend(libxl_ctx *ctx, libxl_domain_suspend_info *info,
uint32_t domid, int fd)
{
diff -r 0446591bee86 -r 822536df4aec tools/libxl/libxl.h
--- a/tools/libxl/libxl.h Mon Jan 23 14:44:00 2012 -0800
+++ b/tools/libxl/libxl.h Mon Jan 23 14:44:02 2012 -0800
@@ -272,6 +272,8 @@
int libxl_domain_create_new(libxl_ctx *ctx, libxl_domain_config *d_config,
libxl_console_ready cb, void *priv, uint32_t *domid);
int libxl_domain_create_restore(libxl_ctx *ctx, libxl_domain_config *d_config,
libxl_console_ready cb, void *priv, uint32_t *domid, int restore_fd);
void libxl_domain_config_dispose(libxl_domain_config *d_config);
+int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info,
+ uint32_t domid, int fd);
int libxl_domain_suspend(libxl_ctx *ctx, libxl_domain_suspend_info *info,
uint32_t domid, int fd);
int libxl_domain_resume(libxl_ctx *ctx, uint32_t domid);
diff -r 0446591bee86 -r 822536df4aec tools/libxl/xl.h
--- a/tools/libxl/xl.h Mon Jan 23 14:44:00 2012 -0800
+++ b/tools/libxl/xl.h Mon Jan 23 14:44:02 2012 -0800
@@ -93,6 +93,8 @@
int main_getenforce(int argc, char **argv);
int main_setenforce(int argc, char **argv);
int main_loadpolicy(int argc, char **argv);
+int main_remus_receive(int argc, char **argv);
+int main_remus(int argc, char **argv);
void help(const char *command);
diff -r 0446591bee86 -r 822536df4aec tools/libxl/xl_cmdimpl.c
--- a/tools/libxl/xl_cmdimpl.c Mon Jan 23 14:44:00 2012 -0800
+++ b/tools/libxl/xl_cmdimpl.c Mon Jan 23 14:44:02 2012 -0800
@@ -1814,7 +1814,7 @@
* If we have daemonized then do not return to the caller -- this has
* already happened in the parent.
*/
- if ( !need_daemon )
+ if ( daemonize && !need_daemon )
exit(ret);
return ret;
@@ -5853,6 +5853,175 @@
return ret;
}
+/*
+ * Entry point for `xl remus-receive`.
+ *
+ * Hands stdin (fd 0) to create_domain() as the incoming checkpoint
+ * stream. When create_domain() returns successfully and a migration
+ * domain name was reported, the domain is renamed to common_domname and
+ * unpaused.
+ *
+ * argc and argv are not used.
+ *
+ * Exits with -rc if create_domain() fails; otherwise returns the negated
+ * result of the last call made.
+ */
+int main_remus_receive(int argc, char **argv)
+{
+    int rc;
+    /*
+     * Start from NULL: the value is tested below, and nothing in this
+     * function guarantees that create_domain() stores through
+     * migration_domname_r.
+     */
+    char *migration_domname = NULL;
+    struct domain_create dom_info;
+
+    signal(SIGPIPE, SIG_IGN);
+    memset(&dom_info, 0, sizeof(dom_info));
+    dom_info.debug = 1;
+    dom_info.no_incr_generationid = 1;
+    dom_info.restore_file = "incoming checkpoint stream";
+    dom_info.migrate_fd = 0; /* stdin - will change in future*/
+    dom_info.migration_domname_r = &migration_domname;
+
+    rc = create_domain(&dom_info);
+    if (rc < 0) {
+        fprintf(stderr, "migration target (Remus): Domain creation failed"
+                " (code %d) domid %u.\n", rc, domid);
+        exit(-rc);
+    }
+
+    /* If we are here, it means that the sender (primary) has crashed.
+     * If domain renaming fails, let's just continue (as we need the domain
+     * to be up & dom names may not matter much, as long as it's reachable
+     * over network).
+     *
+     * If domain unpausing fails, destroy domain ? Or is it better to have
+     * a consistent copy of the domain (memory, cpu state, disk)
+     * on at least one physical host ? Right now, let's just leave the domain
+     * as is and let the Administrator decide (or troubleshoot).
+     */
+    fprintf(stderr, "migration target: Remus Failover for domain %u\n", domid);
+    /*
+     * NOTE(review): when no migration name was reported the domain is
+     * neither renamed nor unpaused, and rc still holds the value from
+     * create_domain() - confirm this is the intended behaviour.
+     */
+    if (migration_domname) {
+        rc = libxl_domain_rename(ctx, domid, migration_domname,
+                                 common_domname);
+        if (rc)
+            fprintf(stderr, "migration target (Remus): "
+                    "Failed to rename domain from %s to %s:%d\n",
+                    migration_domname, common_domname, rc);
+
+        /* A rename failure is deliberately not fatal; rc is overwritten. */
+        rc = libxl_domain_unpause(ctx, domid);
+        if (rc)
+            fprintf(stderr, "migration target (Remus): "
+                    "Failed to unpause domain %s (id: %u):%d\n",
+                    common_domname, domid, rc);
+    }
+
+    return -rc;
+}
+
+
+/*
+ * Entry point for `xl remus`: start Remus checkpointing for a domain.
+ *
+ * Options:
+ *   -i MS   checkpoint interval in milliseconds (default 200)
+ *   -b      blackhole mode: checkpoints are written to /dev/null
+ *   -u      disable checkpoint compression
+ *   -s CMD  command used instead of "ssh"; if empty, <host> itself is
+ *           run through sh
+ *
+ * Positional arguments: <Domain> followed by <DestinationHost>. The host
+ * is only used when not in blackhole mode.
+ *
+ * Outside blackhole mode a child process is forked that runs the
+ * receive command through sh, with its stdin and stdout connected to
+ * this process via pipes. The stored domain config is written to the
+ * child first, then libxl_domain_remus_start() is called.
+ *
+ * Returns 0 or 2 when def_getopt() asks for it, 2 for an invalid
+ * interval or a missing host, 1 if the command string cannot be built,
+ * and -ERROR_FAIL once libxl_domain_remus_start() has returned. Several
+ * setup failures call exit() directly.
+ */
+int main_remus(int argc, char **argv)
+{
+    int opt, rc;
+    const char *ssh_command = "ssh";
+    char *host = NULL, *rune = NULL, *domain = NULL;
+
+    int sendpipe[2], recvpipe[2];
+    int send_fd = -1, recv_fd = -1;
+    pid_t child = -1;
+
+    uint8_t *config_data;
+    int config_len;
+
+    libxl_domain_remus_info r_info;
+
+    memset(&r_info, 0, sizeof(libxl_domain_remus_info));
+    /* Defaults */
+    r_info.interval = 200;
+    r_info.blackhole = 0;
+    r_info.compression = 1;
+
+    while ((opt = def_getopt(argc, argv, "bui:s:", "remus", 2)) != -1) {
+        switch (opt) {
+        case 0: case 2:
+            return opt;
+
+        case 'i': {
+            /*
+             * strtol() rather than atoi(): reject non-numeric input,
+             * trailing garbage, negative values and values that do not
+             * survive being stored in the interval field.
+             */
+            char *end;
+            long ms = strtol(optarg, &end, 10);
+
+            r_info.interval = ms;
+            if (end == optarg || *end != '\0' || ms < 0 ||
+                r_info.interval != ms) {
+                fprintf(stderr, "Invalid checkpoint interval \"%s\".\n",
+                        optarg);
+                return 2;
+            }
+            break;
+        }
+        case 'b':
+            r_info.blackhole = 1;
+            break;
+        case 'u':
+            r_info.compression = 0;
+            break;
+        case 's':
+            ssh_command = optarg;
+            break;
+        }
+    }
+
+    domain = argv[optind];
+    host = argv[optind + 1];
+
+    if (r_info.blackhole) {
+        find_domain(domain);
+        send_fd = open("/dev/null", O_RDWR, 0644);
+        if (send_fd < 0) {
+            perror("failed to open /dev/null");
+            exit(-1);
+        }
+    } else {
+
+        /* The host is needed to build the receive command below. */
+        if (!host) {
+            fprintf(stderr, "No destination host supplied - "
+                    "cannot start remus.\n");
+            return 2;
+        }
+
+        if (!ssh_command[0]) {
+            rune = host;
+        } else {
+            if (asprintf(&rune, "exec %s %s xl remus-receive",
+                         ssh_command, host) < 0)
+                return 1;
+        }
+
+        save_domain_core_begin(domain, NULL, &config_data, &config_len);
+
+        if (!config_len) {
+            fprintf(stderr, "No config file stored for running domain and "
+                    "none supplied - cannot start remus.\n");
+            exit(1);
+        }
+
+        MUST( libxl_pipe(ctx, sendpipe) );
+        MUST( libxl_pipe(ctx, recvpipe) );
+
+        child = libxl_fork(ctx);
+        if (child==-1) exit(1);
+
+        /* TODO: change this to plain TCP socket based channel
+         * instead of SSH.
+         */
+        if (!child) {
+            /* Child: read from sendpipe on stdin, write to recvpipe on stdout. */
+            dup2(sendpipe[0], 0);
+            dup2(recvpipe[1], 1);
+            close(sendpipe[0]); close(sendpipe[1]);
+            close(recvpipe[0]); close(recvpipe[1]);
+            execlp("sh","sh","-c",rune,(char*)0);
+            perror("failed to exec sh");
+            exit(-1);
+        }
+
+        /* Parent: keep only the write end of sendpipe and read end of recvpipe. */
+        close(sendpipe[0]);
+        close(recvpipe[1]);
+        send_fd = sendpipe[1];
+        recv_fd = recvpipe[0];
+
+        signal(SIGPIPE, SIG_IGN);
+
+        save_domain_core_writeconfig(send_fd, "migration stream",
+                                     config_data, config_len);
+    }
+
+    /* Point of no return */
+    rc = libxl_domain_remus_start(ctx, &r_info, domid, send_fd);
+
+    /* If we are here, it means backup has failed/domain suspend failed.
+     * Try to resume the domain and exit gracefully.
+     */
+    fprintf(stderr, "remus sender: libxl_domain_suspend failed"
+            " (rc=%d)\n", rc);
+
+    if (rc == ERROR_GUEST_TIMEDOUT)
+        fprintf(stderr, "Failed to suspend domain at primary.\n");
+    else {
+        fprintf(stderr, "Remus: Backup failed? resuming domain at primary.\n");
+        libxl_domain_resume(ctx, domid);
+    }
+
+    close(send_fd);
+    /* recv_fd is only opened on the non-blackhole path. */
+    if (recv_fd >= 0)
+        close(recv_fd);
+    return -ERROR_FAIL;
+}
+
+
/*
* Local variables:
* mode: C
diff -r 0446591bee86 -r 822536df4aec tools/libxl/xl_cmdtable.c
--- a/tools/libxl/xl_cmdtable.c Mon Jan 23 14:44:00 2012 -0800
+++ b/tools/libxl/xl_cmdtable.c Mon Jan 23 14:44:02 2012 -0800
@@ -407,6 +407,22 @@
"Loads a new policy int the Flask Xen security module",
"<policy file>",
},
+ { "remus-receive",
+ &main_remus_receive, 0,
+ "Remus Checkpoint Receiver",
+ "- for internal use only",
+ },
+    { "remus",
+      &main_remus, 0,
+      "Enable Remus HA for domain",
+      "[options] <Domain> [<DestinationHost>]",
+      /* Option help: adjacent literals concatenate into a single string. */
+      "-i MS Checkpoint domain memory every MS milliseconds (def. 200ms).\n"
+      "-b Replicate memory checkpoints to /dev/null (blackhole)\n"
+      "-u Disable memory checkpoint compression.\n"
+      "-s <sshcommand> Use <sshcommand> instead of ssh. String will be passed\n"
+      " to sh. If empty, run <host> instead of \n"
+      " ssh <host> xl remus-receive\n"
+    },
};
int cmdtable_len = sizeof(cmd_table)/sizeof(struct cmd_spec);
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |