[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH 3 of 3] tools/libxl: xl remus/remus-receive commands
# HG changeset patch # User Shriram Rajagopalan <rshriram@xxxxxxxxx> # Date 1327358642 28800 # Node ID 822536df4aeced5aee00f1f26299086faa622681 # Parent 0446591bee86eb4e767d75b70c885549c7a3cfef tools/libxl: xl remus/remus-receive commands * xl remus (and its receive counterpart remus-receive) act as frontends to enable remus for a given domain. * At the moment, only memory checkpointing and blackhole replication are supported. Support for disk checkpointing and network buffering will be added in future. * xl remus borrows some aspects of xl migrate. Replication is currently done over a ssh connection. Future versions will use a low-overhead plain tcp socket for replication (similar to xend/remus). Signed-off-by: Shriram Rajagopalan <rshriram@xxxxxxxxx> diff -r 0446591bee86 -r 822536df4aec tools/libxl/libxl.c --- a/tools/libxl/libxl.c Mon Jan 23 14:44:00 2012 -0800 +++ b/tools/libxl/libxl.c Mon Jan 23 14:44:02 2012 -0800 @@ -466,6 +466,40 @@ return ptr; } +int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info, + uint32_t domid, int fd) +{ + GC_INIT(ctx); + libxl_domain_type type = libxl__domain_type(gc, domid); + int rc = 0; + + if (info == NULL) { + LIBXL__LOG(ctx, LIBXL__LOG_ERROR, + "No remus_info structure supplied for domain %d", domid); + rc = -1; + goto remus_fail; + } + + /* TBD: Remus setup - i.e. attach qdisc, enable disk buffering, etc */ + + /* Point of no return */ + rc = libxl__domain_suspend_common(gc, domid, fd, type, /* live */ 1, + /* debug */ 0, info); + + /* + * With Remus, if we reach this point, it means either + * backup died or some network error occurred preventing us + * from sending checkpoints. + */ + + /* TBD: Remus cleanup - i.e. detach qdisc, release other + * resources. 
+ */ + remus_fail: + GC_FREE; + return rc; +} + int libxl_domain_suspend(libxl_ctx *ctx, libxl_domain_suspend_info *info, uint32_t domid, int fd) { diff -r 0446591bee86 -r 822536df4aec tools/libxl/libxl.h --- a/tools/libxl/libxl.h Mon Jan 23 14:44:00 2012 -0800 +++ b/tools/libxl/libxl.h Mon Jan 23 14:44:02 2012 -0800 @@ -272,6 +272,8 @@ int libxl_domain_create_new(libxl_ctx *ctx, libxl_domain_config *d_config, libxl_console_ready cb, void *priv, uint32_t *domid); int libxl_domain_create_restore(libxl_ctx *ctx, libxl_domain_config *d_config, libxl_console_ready cb, void *priv, uint32_t *domid, int restore_fd); void libxl_domain_config_dispose(libxl_domain_config *d_config); +int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info, + uint32_t domid, int fd); int libxl_domain_suspend(libxl_ctx *ctx, libxl_domain_suspend_info *info, uint32_t domid, int fd); int libxl_domain_resume(libxl_ctx *ctx, uint32_t domid); diff -r 0446591bee86 -r 822536df4aec tools/libxl/xl.h --- a/tools/libxl/xl.h Mon Jan 23 14:44:00 2012 -0800 +++ b/tools/libxl/xl.h Mon Jan 23 14:44:02 2012 -0800 @@ -93,6 +93,8 @@ int main_getenforce(int argc, char **argv); int main_setenforce(int argc, char **argv); int main_loadpolicy(int argc, char **argv); +int main_remus_receive(int argc, char **argv); +int main_remus(int argc, char **argv); void help(const char *command); diff -r 0446591bee86 -r 822536df4aec tools/libxl/xl_cmdimpl.c --- a/tools/libxl/xl_cmdimpl.c Mon Jan 23 14:44:00 2012 -0800 +++ b/tools/libxl/xl_cmdimpl.c Mon Jan 23 14:44:02 2012 -0800 @@ -1814,7 +1814,7 @@ * If we have daemonized then do not return to the caller -- this has * already happened in the parent. 
*/ - if ( !need_daemon ) + if ( daemonize && !need_daemon ) exit(ret); return ret; @@ -5853,6 +5853,175 @@ return ret; } +int main_remus_receive(int argc, char **argv) +{ + int rc; + char *migration_domname; + struct domain_create dom_info; + + signal(SIGPIPE, SIG_IGN); + memset(&dom_info, 0, sizeof(dom_info)); + dom_info.debug = 1; + dom_info.no_incr_generationid = 1; + dom_info.restore_file = "incoming checkpoint stream"; + dom_info.migrate_fd = 0; /* stdin - will change in future*/ + dom_info.migration_domname_r = &migration_domname; + + rc = create_domain(&dom_info); + if (rc < 0) { + fprintf(stderr, "migration target (Remus): Domain creation failed" + " (code %d) domid %u.\n", rc, domid); + exit(-rc); + } + + /* If we are here, it means that the sender (primary) has crashed. + * If domain renaming fails, let's just continue (as we need the domain + * to be up & dom names may not matter much, as long as it's reachable + * over network). + * + * If domain unpausing fails, destroy domain ? Or is it better to have + * a consistent copy of the domain (memory, cpu state, disk) + * on at least one physical host ? Right now, let's just leave the domain + * as is and let the Administrator decide (or troubleshoot).
+ */ + fprintf(stderr, "migration target: Remus Failover for domain %u\n", domid); + if (migration_domname) { + rc = libxl_domain_rename(ctx, domid, migration_domname, + common_domname); + if (rc) + fprintf(stderr, "migration target (Remus): " + "Failed to rename domain from %s to %s:%d\n", + migration_domname, common_domname, rc); + + rc = libxl_domain_unpause(ctx, domid); + if (rc) + fprintf(stderr, "migration target (Remus): " + "Failed to unpause domain %s (id: %u):%d\n", + common_domname, domid, rc); + } + + return -rc; +} + +int main_remus(int argc, char **argv) +{ + int opt, rc; + const char *ssh_command = "ssh"; + char *host = NULL, *rune = NULL, *domain = NULL; + + int sendpipe[2], recvpipe[2]; + int send_fd = -1, recv_fd = -1; + pid_t child = -1; + + uint8_t *config_data; + int config_len; + + libxl_domain_remus_info r_info; + + memset(&r_info, 0, sizeof(libxl_domain_remus_info)); + /* Defaults */ + r_info.interval = 200; + r_info.blackhole = 0; + r_info.compression = 1; + + while ((opt = def_getopt(argc, argv, "bui:s:", "remus", 2)) != -1) { + switch (opt) { + case 0: case 2: + return opt; + + case 'i': + r_info.interval = atoi(optarg); + break; + case 'b': + r_info.blackhole = 1; + break; + case 'u': + r_info.compression = 0; + break; + case 's': + ssh_command = optarg; + break; + } + } + + domain = argv[optind]; + host = argv[optind + 1]; + + if (r_info.blackhole) { + find_domain(domain); + send_fd = open("/dev/null", O_RDWR, 0644); + if (send_fd < 0) { + perror("failed to open /dev/null"); + exit(-1); + } + } else { + + if (!ssh_command[0]) { + rune = host; + } else { + if (asprintf(&rune, "exec %s %s xl remus-receive", + ssh_command, host) < 0) + return 1; + } + + save_domain_core_begin(domain, NULL, &config_data, &config_len); + + if (!config_len) { + fprintf(stderr, "No config file stored for running domain and " + "none supplied - cannot start remus.\n"); + exit(1); + } + + MUST( libxl_pipe(ctx, sendpipe) ); + MUST( libxl_pipe(ctx, recvpipe) ); + 
+ child = libxl_fork(ctx); + if (child==-1) exit(1); + + /* TODO: change this to plain TCP socket based channel + * instead of SSH. + */ + if (!child) { + dup2(sendpipe[0], 0); + dup2(recvpipe[1], 1); + close(sendpipe[0]); close(sendpipe[1]); + close(recvpipe[0]); close(recvpipe[1]); + execlp("sh","sh","-c",rune,(char*)0); + perror("failed to exec sh"); + exit(-1); + } + + close(sendpipe[0]); + close(recvpipe[1]); + send_fd = sendpipe[1]; + recv_fd = recvpipe[0]; + + signal(SIGPIPE, SIG_IGN); + + save_domain_core_writeconfig(send_fd, "migration stream", + config_data, config_len); + } + + /* Point of no return */ + rc = libxl_domain_remus_start(ctx, &r_info, domid, send_fd); + + /* If we are here, it means backup has failed/domain suspend failed. + * Try to resume the domain and exit gracefully. + */ + fprintf(stderr, "remus sender: libxl_domain_suspend failed" + " (rc=%d)\n", rc); + + if (rc == ERROR_GUEST_TIMEDOUT) + fprintf(stderr, "Failed to suspend domain at primary.\n"); + else { + fprintf(stderr, "Remus: Backup failed? resuming domain at primary.\n"); + libxl_domain_resume(ctx, domid); + } + + close(send_fd); + return -ERROR_FAIL; +} + /* * Local variables: * mode: C diff -r 0446591bee86 -r 822536df4aec tools/libxl/xl_cmdtable.c --- a/tools/libxl/xl_cmdtable.c Mon Jan 23 14:44:00 2012 -0800 +++ b/tools/libxl/xl_cmdtable.c Mon Jan 23 14:44:02 2012 -0800 @@ -407,6 +407,22 @@ "Loads a new policy int the Flask Xen security module", "<policy file>", }, + { "remus-receive", + &main_remus_receive, 0, + "Remus Checkpoint Receiver", + "- for internal use only", + }, + { "remus", + &main_remus, 0, + "Enable Remus HA for domain", + "[options] <Domain> [<DestinationHost>]", + "-i MS Checkpoint domain memory every MS milliseconds (def. 200ms).\n" + "-b Replicate memory checkpoints to /dev/null (blackhole)\n" + "-u Disable memory checkpoint compression.\n" + "-s <sshcommand> Use <sshcommand> instead of ssh. String will be passed\n" + " to sh. 
If empty, run <host> instead of \n" + " ssh <host> xl remus-receive\n" + }, }; int cmdtable_len = sizeof(cmd_table)/sizeof(struct cmd_spec); _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |