[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH 17 of 21 RESEND] blktap3/drivers: Introduce representation and management of Virtual Block Devices in tapdisk
This patch copies the representation and management of Virtual Block Devices in tapdisk from blktap2. Most changes come from blktap2.5. Also, it contains the following blktap3-related changes: * Don't set the minor number in function tapdisk_vbd_create as it has been removed. * Function tapdisk_vbd_initialize now uses the type:/path/to/file instead of the minor number. * Function signal_enospc uses the /path/to/file instead of the minor number. * Function tapdisk_vbd_open_vdi uses the type:/path/to/file instead of the minor number, and /path/to/file instead of the parent minor number. * Remove functions tapdisk_vbd_detach, tapdisk_vbd_attach, and tapdisk_vbd_open. Signed-off-by: Thanos Makatos <thanos.makatos@xxxxxxxxxx> diff --git a/tools/blktap2/drivers/tapdisk-vbd.c b/tools/blktap3/drivers/tapdisk-vbd.c copy from tools/blktap2/drivers/tapdisk-vbd.c copy to tools/blktap3/drivers/tapdisk-vbd.c --- a/tools/blktap2/drivers/tapdisk-vbd.c +++ b/tools/blktap3/drivers/tapdisk-vbd.c @@ -1,5 +1,7 @@ -/* +/* * Copyright (c) 2008, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -25,6 +27,7 @@ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + #include <stdio.h> #include <errno.h> #include <fcntl.h> @@ -34,22 +37,22 @@ #include <libgen.h> #include <sys/mman.h> #include <sys/ioctl.h> -#ifdef MEMSHR -#include <memshr.h> -#endif +#include "libvhd.h" #include "tapdisk-image.h" #include "tapdisk-driver.h" #include "tapdisk-server.h" +#include "tapdisk-vbd.h" +#include "tapdisk-disktype.h" #include "tapdisk-interface.h" -#include "tapdisk-disktype.h" -#include "tapdisk-vbd.h" -#include "blktap2.h" +#include "tapdisk-stats.h" +#include "sring/td-stats.h" +#include "tapdisk-storage.h" #define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a) #define ERR(_err, _f, _a...) 
tlog_error(_err, _f, ##_a) -#if 1 +#if 1 #define ASSERT(p) \ do { \ if (!(p)) { \ @@ -60,43 +63,30 @@ } while (0) #else #define ASSERT(p) ((void)0) -#endif +#endif #define TD_VBD_EIO_RETRIES 10 #define TD_VBD_EIO_SLEEP 1 #define TD_VBD_WATCHDOG_TIMEOUT 10 -static void tapdisk_vbd_ring_event(event_id_t, char, void *); -static void tapdisk_vbd_callback(void *, blkif_response_t *); +static void tapdisk_vbd_complete_vbd_request(td_vbd_t *, + td_vbd_request_t *); +static int tapdisk_vbd_queue_ready(td_vbd_t *); +static void tapdisk_vbd_check_queue_state(td_vbd_t *); -/* +/* * initialization */ -static inline void -tapdisk_vbd_initialize_vreq(td_vbd_request_t *vreq) +static void tapdisk_vbd_mark_progress(td_vbd_t * vbd) { - memset(vreq, 0, sizeof(td_vbd_request_t)); - INIT_LIST_HEAD(&vreq->next); + gettimeofday(&vbd->ts, NULL); } -void -tapdisk_vbd_free(td_vbd_t *vbd) -{ - if (vbd) { - tapdisk_vbd_free_stack(vbd); - list_del_init(&vbd->next); - free(vbd->name); - free(vbd); - } -} - -td_vbd_t* -tapdisk_vbd_create(uint16_t uuid) +td_vbd_t *tapdisk_vbd_create(void) { td_vbd_t *vbd; - int i; vbd = calloc(1, sizeof(td_vbd_t)); if (!vbd) { @@ -104,103 +94,62 @@ tapdisk_vbd_create(uint16_t uuid) return NULL; } - vbd->uuid = uuid; - vbd->minor = -1; - vbd->ring.fd = -1; - - /* default blktap ring completion */ - vbd->callback = tapdisk_vbd_callback; - vbd->argument = vbd; - -#ifdef MEMSHR - memshr_vbd_initialize(); -#endif - - INIT_LIST_HEAD(&vbd->driver_stack); - INIT_LIST_HEAD(&vbd->images); - INIT_LIST_HEAD(&vbd->new_requests); - INIT_LIST_HEAD(&vbd->pending_requests); - INIT_LIST_HEAD(&vbd->failed_requests); - INIT_LIST_HEAD(&vbd->completed_requests); - INIT_LIST_HEAD(&vbd->next); - gettimeofday(&vbd->ts, NULL); - - for (i = 0; i < MAX_REQUESTS; i++) - tapdisk_vbd_initialize_vreq(vbd->request_list + i); + TAILQ_INIT(&vbd->images); + TAILQ_INIT(&vbd->new_requests); + TAILQ_INIT(&vbd->pending_requests); + TAILQ_INIT(&vbd->failed_requests); + 
TAILQ_INIT(&vbd->completed_requests); + tapdisk_vbd_mark_progress(vbd); return vbd; } -int -tapdisk_vbd_initialize(uint16_t uuid) +int tapdisk_vbd_initialize(int rfd __attribute__((unused)), + int wfd __attribute__((unused)), const char * params) { td_vbd_t *vbd; - vbd = tapdisk_server_get_vbd(uuid); + assert(params); + + vbd = tapdisk_server_get_vbd(params); if (vbd) { - EPRINTF("duplicate vbds! %u\n", uuid); + EPRINTF("duplicate vbds %s\n", params); return -EEXIST; } - vbd = tapdisk_vbd_create(uuid); + vbd = tapdisk_vbd_create(); tapdisk_server_add_vbd(vbd); return 0; } -void -tapdisk_vbd_set_callback(td_vbd_t *vbd, td_vbd_cb_t callback, void *argument) +static int tapdisk_vbd_validate_chain(td_vbd_t * vbd) { - vbd->callback = callback; - vbd->argument = argument; + return tapdisk_image_validate_chain(&vbd->images); } -static int -tapdisk_vbd_validate_chain(td_vbd_t *vbd) +void tapdisk_vbd_close_vdi(td_vbd_t * vbd) { - int err; - td_image_t *image, *parent, *tmp; + tapdisk_image_close_chain(&vbd->images); - DPRINTF("VBD CHAIN:\n"); - - tapdisk_vbd_for_each_image(vbd, image, tmp) { - DPRINTF("%s: %d\n", image->name, image->type); - - if (tapdisk_vbd_is_last_image(vbd, image)) - break; - - parent = tapdisk_vbd_next_image(image); - err = td_validate_parent(image, parent); - if (err) - return err; + if (vbd->secondary && vbd->secondary_mode != TD_VBD_SECONDARY_MIRROR) { + tapdisk_image_close(vbd->secondary, NULL); + vbd->secondary = NULL; } - return 0; + if (vbd->retired) { + tapdisk_image_close(vbd->retired, NULL); + vbd->retired = NULL; + } + + td_flag_set(vbd->state, TD_VBD_CLOSED); } -void -tapdisk_vbd_close_vdi(td_vbd_t *vbd) +static int tapdisk_vbd_add_block_cache(td_vbd_t * vbd) { - td_image_t *image, *tmp; - - tapdisk_vbd_for_each_image(vbd, image, tmp) { - td_close(image); - tapdisk_image_free(image); - } - - INIT_LIST_HEAD(&vbd->images); - td_flag_set(vbd->state, TD_VBD_CLOSED); - - tapdisk_vbd_free_stack(vbd); -} - -static int 
-tapdisk_vbd_add_block_cache(td_vbd_t *vbd) -{ - int err; - td_driver_t *driver; td_image_t *cache, *image, *target, *tmp; + int err; target = NULL; @@ -215,10 +164,7 @@ tapdisk_vbd_add_block_cache(td_vbd_t *vb return 0; cache = tapdisk_image_allocate(target->name, - DISK_TYPE_BLOCK_CACHE, - target->storage, - target->flags, - target->private); + DISK_TYPE_BLOCK_CACHE, target->flags); if (!cache) return -ENOMEM; @@ -234,9 +180,7 @@ tapdisk_vbd_add_block_cache(td_vbd_t *vb } cache->driver = tapdisk_driver_allocate(cache->type, - cache->name, - cache->flags, - cache->storage); + cache->name, cache->flags); if (!cache->driver) { err = -ENOMEM; goto fail; @@ -251,468 +195,297 @@ tapdisk_vbd_add_block_cache(td_vbd_t *vb fail: /* give up */ - tapdisk_image_free(target); + tapdisk_image_free(target, NULL); return err; done: /* insert cache before image */ - list_add(&cache->next, target->next.prev); + TAILQ_INSERT_BEFORE(target, cache, entry); return 0; } -static int -tapdisk_vbd_add_dirty_log(td_vbd_t *vbd) +static int tapdisk_vbd_add_local_cache(td_vbd_t * vbd) { + td_image_t *cache, *parent; int err; - td_driver_t *driver; - td_image_t *log, *parent; - - driver = NULL; - log = NULL; parent = tapdisk_vbd_first_image(vbd); + if (tapdisk_vbd_is_last_image(vbd, parent)) { + DPRINTF("Single-image chain, nothing to cache"); + return 0; + } - log = tapdisk_image_allocate(parent->name, - DISK_TYPE_LOG, - parent->storage, - parent->flags, - vbd); - if (!log) + cache = tapdisk_image_allocate(parent->name, + DISK_TYPE_LCACHE, parent->flags); + + if (!cache) return -ENOMEM; - driver = tapdisk_driver_allocate(log->type, - log->name, - log->flags, - log->storage); - if (!driver) { + /* try to load existing cache */ + err = td_load(cache); + if (!err) + goto done; + + cache->driver = tapdisk_driver_allocate(cache->type, + cache->name, cache->flags); + if (!cache->driver) { err = -ENOMEM; goto fail; } - driver->info = parent->driver->info; - log->driver = driver; + cache->driver->info 
= parent->driver->info; - err = td_open(log); + /* try to open new cache */ + err = td_open(cache); + if (!err) + goto done; + +fail: + tapdisk_image_free(cache, NULL); + return err; + + done: + /* insert cache right above leaf image */ + TAILQ_INSERT_AFTER(&vbd->images, parent, cache, entry); + + DPRINTF("Added local_cache driver\n"); + return 0; +} + +/** + * Adds a secondary VBD to this VBD. + */ +static int tapdisk_vbd_add_secondary(td_vbd_t * vbd) +{ + td_image_t *leaf, *second = NULL; + const char *path; + int type, err; + + DPRINTF("Adding secondary image: %s\n", vbd->secondary_name); + + type = tapdisk_disktype_parse_params(vbd->secondary_name, &path); + if (type < 0) + return type; + + leaf = tapdisk_vbd_first_image(vbd); + if (!leaf) { + err = -EINVAL; + goto fail; + } + + err = tapdisk_image_open(type, path, leaf->flags, &second); + if (err) + goto fail; + + if (second->info.size != leaf->info.size) { + EPRINTF("Secondary image size %" PRIu64 " != image size %" PRIu64 + "\n", second->info.size, leaf->info.size); + err = -EINVAL; + goto fail; + } + + vbd->secondary = second; + leaf->flags |= TD_IGNORE_ENOSPC; + if (td_flag_test(vbd->flags, TD_OPEN_STANDBY)) { + DPRINTF("In standby mode\n"); + vbd->secondary_mode = TD_VBD_SECONDARY_STANDBY; + } else { + DPRINTF("In mirror mode\n"); + vbd->secondary_mode = TD_VBD_SECONDARY_MIRROR; + /* we actually need this image to also be part of the chain, + * since it may already contain data */ + TAILQ_INSERT_AFTER(&vbd->images, leaf, second, entry); + } + + DPRINTF("Added secondary image\n"); + return 0; + + fail: + if (second) + tapdisk_image_close(second, NULL); + return err; + } + +static void signal_enospc(td_vbd_t * vbd) +{ + int fd, err; + char *fn; + + /* TODO Some external tool is probably using this, figure out which and + * update it. 
*/ + assert(vbd->name); + err = asprintf(&fn, BLKTAP3_ENOSPC_SIGNAL_FILE "%s", vbd->name); + if (err == -1) { + EPRINTF("Failed to signal ENOSPC condition\n"); + return; + } + + fd = open(fn, O_WRONLY | O_CREAT | O_NONBLOCK, 0666); + if (fd == -1) + EPRINTF("Failed to open file to signal ENOSPC condition\n"); + else + close(fd); + + free(fn); +} + +/* XXX This is commented out in blktap2.5. */ +#if 0 +static int tapdisk_vbd_open_index(td_vbd_t * vbd) +{ + int err; + char *path; + td_flag_t flags; + td_image_t *last, *image; + + last = tapdisk_vbd_last_image(vbd); + err = asprintf(&path, "%s.bat", last->name); + if (err == -1) + return -errno; + + err = access(path, R_OK); + if (err == -1) { + free(path); + return -errno; + } + + flags = vbd->flags | TD_OPEN_RDONLY | TD_OPEN_SHAREABLE; + image = tapdisk_image_allocate(path, DISK_TYPE_VINDEX, flags); + if (!image) { + err = -ENOMEM; + goto fail; + } + + err = td_open(image); if (err) goto fail; - list_add(&log->next, &vbd->images); + tapdisk_vbd_add_image(vbd, image); return 0; fail: - tapdisk_image_free(log); + if (image) + tapdisk_image_free(image); + free(path); return err; } +#endif -static int -tapdisk_vbd_open_level(td_vbd_t *vbd, struct list_head *head, - const char *params, int driver_type, - td_disk_info_t *driver_info, td_flag_t flags) +static int tapdisk_vbd_add_dirty_log(td_vbd_t * vbd) { - const char *name; - int type, err; - td_image_t *image; - td_disk_id_t id; - td_driver_t *driver; + int err; + td_driver_t *driver; + td_image_t *log, *parent; - name = params; - id.name = NULL; - type = driver_type; - INIT_LIST_HEAD(head); + driver = NULL; + log = NULL; - for (;;) { - err = -ENOMEM; - image = tapdisk_image_allocate(name, type, - vbd->storage, flags, vbd); + parent = tapdisk_vbd_first_image(vbd); - free(id.name); + log = tapdisk_image_allocate(parent->name, + DISK_TYPE_LOG, parent->flags); + if (!log) + return -ENOMEM; - if (!image) - goto out; - - - /* this breaks if a driver modifies its info within 
a layer */ - err = __td_open(image, driver_info); - if (err) - goto out; - - /* TODO: non-sink drivers that don't care about their child - * currently return EINVAL. Could return TD_PARENT_OK or - * TD_ANY_PARENT */ - - err = td_get_parent_id(image, &id); - if (err && (err != TD_NO_PARENT && err != -EINVAL)) { - td_close(image); - goto out; - } - - /* add this image to the end of the list */ - list_add_tail(&image->next, head); - image = NULL; - - /* if the image does not have a parent we return the - * list of images generated by this level of the stack */ - if (err == TD_NO_PARENT || err == -EINVAL) { - err = 0; - goto out; - } - - name = id.name; - type = id.drivertype; - - flags |= (TD_OPEN_RDONLY | TD_OPEN_SHAREABLE); - } - -out: - if (err) { - if (image) { - td_close(image); - tapdisk_image_free(image); - } - while (!list_empty(head)) { - image = list_entry(&head->next, td_image_t, next); - td_close(image); - tapdisk_image_free(image); - } - } - - return err; + driver = tapdisk_driver_allocate(log->type, log->name, log->flags); + if (!driver) { + err = -ENOMEM; + goto fail; } -static int -__tapdisk_vbd_open_vdi(td_vbd_t *vbd, td_flag_t extra_flags) -{ - int err; - td_flag_t flags; - td_image_t *tmp; - td_vbd_driver_info_t *driver_info; - struct list_head *images; - td_disk_info_t *parent_info = NULL; + driver->info = parent->driver->info; + log->driver = driver; - if (list_empty(&vbd->driver_stack)) - return -ENOENT; - - flags = (vbd->flags & ~TD_OPEN_SHAREABLE) | extra_flags; - - /* loop on each user specified driver. - * NOTE: driver_info is in reverse order. 
That is, the first - * item is the 'parent' or 'sink' driver */ - list_for_each_entry(driver_info, &vbd->driver_stack, next) { - LIST_HEAD(images); - - err = tapdisk_vbd_open_level(vbd, &images, - driver_info->params, - driver_info->type, - parent_info, flags); - if (err) - goto fail; - - /* after each loop, - * append the created stack to the result stack */ - list_splice(&images, &vbd->images); - - /* set the parent_info to the first diskinfo on the stack */ - tmp = tapdisk_vbd_first_image(vbd); - parent_info = &tmp->info; - } - - if (td_flag_test(vbd->flags, TD_OPEN_LOG_DIRTY)) { - err = tapdisk_vbd_add_dirty_log(vbd); - if (err) - goto fail; - } - - if (td_flag_test(vbd->flags, TD_OPEN_ADD_CACHE)) { - err = tapdisk_vbd_add_block_cache(vbd); - if (err) - goto fail; - } - - err = tapdisk_vbd_validate_chain(vbd); + err = td_open(log); if (err) goto fail; - td_flag_clear(vbd->state, TD_VBD_CLOSED); - - return 0; - -fail: - tapdisk_vbd_close_vdi(vbd); - return err; -} - -/* this populates a vbd type based on path */ -int -tapdisk_vbd_parse_stack(td_vbd_t *vbd, const char *path) -{ - int err; - char *params, *driver_str; - td_vbd_driver_info_t *driver; - - err = tapdisk_namedup(&params, path); - if (err) - return err; - - /* tokenize params based on pipe '|' */ - driver_str = strtok(params, "|"); - while (driver_str != NULL) { - const char *path; - int type; - - /* parse driver info and add to vbd */ - driver = calloc(1, sizeof(td_vbd_driver_info_t)); - if (!driver) { - PERROR("malloc"); - err = -errno; - goto out; - } - INIT_LIST_HEAD(&driver->next); - - err = tapdisk_parse_disk_type(driver_str, &path, &type); - if (err) { - free(driver); - goto out; - } - - driver->type = type; - driver->params = strdup(path); - if (!driver->params) { - err = -ENOMEM; - free(driver); - goto out; - } - - /* build the list backwards as the last driver will be the - * first driver to open in the stack */ - list_add(&driver->next, &vbd->driver_stack); - - /* get next driver string */ - 
driver_str = strtok(NULL, "|"); - } - -out: - free(params); - if (err) - tapdisk_vbd_free_stack(vbd); - - return err; -} - -void -tapdisk_vbd_free_stack(td_vbd_t *vbd) -{ - td_vbd_driver_info_t *driver; - - while (!list_empty(&vbd->driver_stack)) { - driver = list_entry(vbd->driver_stack.next, - td_vbd_driver_info_t, next); - list_del(&driver->next); - free(driver->params); - free(driver); - } -} - -/* NOTE: driver type, etc. must be set */ -int -tapdisk_vbd_open_stack(td_vbd_t *vbd, uint16_t storage, td_flag_t flags) -{ - int i, err; - - vbd->flags = flags; - vbd->storage = storage; - - for (i = 0; i < TD_VBD_EIO_RETRIES; i++) { - err = __tapdisk_vbd_open_vdi(vbd, 0); - if (err != -EIO) - break; - - sleep(TD_VBD_EIO_SLEEP); - } - if (err) - goto fail; - + tapdisk_vbd_add_image(vbd, log); return 0; fail: + tapdisk_image_free(log, NULL); return err; } int -tapdisk_vbd_open_vdi(td_vbd_t *vbd, const char *path, - uint16_t drivertype, uint16_t storage, td_flag_t flags) +tapdisk_vbd_open_vdi(td_vbd_t * vbd, const char *params, td_flag_t flags, + const char * prt_path) { - int i, err; - const struct tap_disk *ops; + char *tmp = vbd->name; + int err; - ops = tapdisk_disk_drivers[drivertype]; - if (!ops) - return -EINVAL; - DPRINTF("Loaded %s driver for vbd %u %s 0x%08x\n", - ops->disk_type, vbd->uuid, path, flags); + if (!TAILQ_EMPTY(&vbd->images)) { + err = -EBUSY; + goto fail; + } - err = tapdisk_namedup(&vbd->name, path); - if (err) - return err; + if (!params && !vbd->name) { + err = -EINVAL; + goto fail; + } - vbd->flags = flags; - vbd->storage = storage; + if (params) { + vbd->name = strdup(params); + if (!vbd->name) { + err = -errno; + goto fail; + } + } - for (i = 0; i < TD_VBD_EIO_RETRIES; i++) { - err = __tapdisk_vbd_open_vdi(vbd, 0); - if (err != -EIO) - break; + err = tapdisk_image_open_chain(vbd->name, flags, prt_path, &vbd->images); + if (err) + goto fail; + assert(!TAILQ_EMPTY(&vbd->images)); - sleep(TD_VBD_EIO_SLEEP); - } - if (err) - goto fail; + 
td_flag_clear(vbd->state, TD_VBD_CLOSED); + vbd->flags = flags; - return 0; + if (td_flag_test(vbd->flags, TD_OPEN_LOG_DIRTY)) { + err = tapdisk_vbd_add_dirty_log(vbd); + if (err) + goto fail; + } -fail: - free(vbd->name); - vbd->name = NULL; - return err; -} - -static int -tapdisk_vbd_register_event_watches(td_vbd_t *vbd) -{ - event_id_t id; - - id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, - vbd->ring.fd, 0, - tapdisk_vbd_ring_event, vbd); - if (id < 0) - return id; - - vbd->ring_event_id = id; - - return 0; -} - -static void -tapdisk_vbd_unregister_events(td_vbd_t *vbd) -{ - if (vbd->ring_event_id) - tapdisk_server_unregister_event(vbd->ring_event_id); -} - -static int -tapdisk_vbd_map_device(td_vbd_t *vbd, const char *devname) -{ - - int err, psize; - td_ring_t *ring; - - ring = &vbd->ring; - psize = getpagesize(); - - ring->fd = open(devname, O_RDWR); - if (ring->fd == -1) { - err = -errno; - EPRINTF("failed to open %s: %d\n", devname, err); + if (td_flag_test(vbd->flags, TD_OPEN_ADD_CACHE)) { + err = tapdisk_vbd_add_block_cache(vbd); + if (err) goto fail; } - ring->mem = mmap(0, psize * BLKTAP_MMAP_REGION_SIZE, - PROT_READ | PROT_WRITE, MAP_SHARED, ring->fd, 0); - if (ring->mem == MAP_FAILED) { - err = -errno; - EPRINTF("failed to mmap %s: %d\n", devname, err); + if (td_flag_test(vbd->flags, TD_OPEN_LOCAL_CACHE)) { + err = tapdisk_vbd_add_local_cache(vbd); + if (err) goto fail; } - ring->sring = (blkif_sring_t *)((unsigned long)ring->mem); - BACK_RING_INIT(&ring->fe_ring, ring->sring, psize); + err = tapdisk_vbd_validate_chain(vbd); + if (err) + goto fail; - ring->vstart = - (unsigned long)ring->mem + (BLKTAP_RING_PAGES * psize); + if (td_flag_test(vbd->flags, TD_OPEN_SECONDARY)) { + err = tapdisk_vbd_add_secondary(vbd); + if (err) + goto fail; + } - ioctl(ring->fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE); + if (tmp != vbd->name) + free(tmp); - return 0; + return err; fail: - if (ring->mem && ring->mem != MAP_FAILED) - munmap(ring->mem, 
psize * BLKTAP_MMAP_REGION_SIZE); - if (ring->fd != -1) - close(ring->fd); - ring->fd = -1; - ring->mem = NULL; - return err; -} + if (vbd->name != tmp) { + free(vbd->name); + vbd->name = tmp; + } -static int -tapdisk_vbd_unmap_device(td_vbd_t *vbd) -{ - int psize; + if (!TAILQ_EMPTY(&vbd->images)) + tapdisk_image_close_chain(&vbd->images); - psize = getpagesize(); + vbd->flags = 0; - if (vbd->ring.fd != -1) - close(vbd->ring.fd); - if (vbd->ring.mem > 0) - munmap(vbd->ring.mem, psize * BLKTAP_MMAP_REGION_SIZE); - - return 0; -} - -void -tapdisk_vbd_detach(td_vbd_t *vbd) -{ - tapdisk_vbd_unregister_events(vbd); - - tapdisk_vbd_unmap_device(vbd); - vbd->minor = -1; -} - - -int -tapdisk_vbd_attach(td_vbd_t *vbd, const char *devname, int minor) -{ - int err; - - err = tapdisk_vbd_map_device(vbd, devname); - if (err) - goto fail; - - err = tapdisk_vbd_register_event_watches(vbd); - if (err) - goto fail; - - vbd->minor = minor; - - return 0; - -fail: - tapdisk_vbd_detach(vbd); - - return err; -} - -int -tapdisk_vbd_open(td_vbd_t *vbd, const char *name, uint16_t type, - uint16_t storage, int minor, const char *ring, td_flag_t flags) -{ - int err; - - err = tapdisk_vbd_open_stack(vbd, storage, flags); - if (err) - goto out; - - err = tapdisk_vbd_attach(vbd, ring, minor); - if (err) - goto out; - - return 0; - -out: - tapdisk_vbd_detach(vbd); - tapdisk_vbd_close_vdi(vbd); - free(vbd->name); - vbd->name = NULL; - return err; + return err; } static void @@ -745,44 +518,39 @@ tapdisk_vbd_queue_count(td_vbd_t *vbd, i *completed = c; } -static int -tapdisk_vbd_shutdown(td_vbd_t *vbd) +static int tapdisk_vbd_shutdown(td_vbd_t * vbd) { int new, pending, failed, completed; - if (!list_empty(&vbd->pending_requests)) + if (!TAILQ_EMPTY(&vbd->pending_requests)) return -EAGAIN; - tapdisk_vbd_kick(vbd); tapdisk_vbd_queue_count(vbd, &new, &pending, &failed, &completed); DPRINTF("%s: state: 0x%08x, new: 0x%02x, pending: 0x%02x, " "failed: 0x%02x, completed: 0x%02x\n", vbd->name, 
vbd->state, new, pending, failed, completed); - DPRINTF("last activity: %010ld.%06lld, errors: 0x%04"PRIx64", " + DPRINTF("last activity: %010ld.%06ld, errors: 0x%04" PRIx64 ", " "retries: 0x%04"PRIx64", received: 0x%08"PRIx64", " "returned: 0x%08"PRIx64", kicked: 0x%08"PRIx64"\n", - vbd->ts.tv_sec, (unsigned long long)vbd->ts.tv_usec, + vbd->ts.tv_sec, vbd->ts.tv_usec, vbd->errors, vbd->retries, vbd->received, vbd->returned, vbd->kicked); tapdisk_vbd_close_vdi(vbd); - tapdisk_vbd_detach(vbd); tapdisk_server_remove_vbd(vbd); - tapdisk_vbd_free(vbd); - - tlog_print_errors(); + free(vbd->name); + free(vbd); return 0; } -int -tapdisk_vbd_close(td_vbd_t *vbd) +int tapdisk_vbd_close(td_vbd_t * vbd) { /* * don't close if any requests are pending in the aio layer */ - if (!list_empty(&vbd->pending_requests)) + if (!TAILQ_EMPTY(&vbd->pending_requests)) goto fail; /* @@ -790,9 +558,9 @@ tapdisk_vbd_close(td_vbd_t *vbd) * requests, try to complete them before closing. */ if (tapdisk_vbd_queue_ready(vbd) && - (!list_empty(&vbd->new_requests) || - !list_empty(&vbd->failed_requests) || - !list_empty(&vbd->completed_requests))) + (!TAILQ_EMPTY(&vbd->new_requests) || + !TAILQ_EMPTY(&vbd->failed_requests) || + !TAILQ_EMPTY(&vbd->completed_requests))) goto fail; return tapdisk_vbd_shutdown(vbd); @@ -807,8 +575,7 @@ fail: * control operations */ -void -tapdisk_vbd_debug(td_vbd_t *vbd) +void tapdisk_vbd_debug(td_vbd_t * vbd) { td_image_t *image, *tmp; int new, pending, failed, completed; @@ -816,49 +583,41 @@ tapdisk_vbd_debug(td_vbd_t *vbd) tapdisk_vbd_queue_count(vbd, &new, &pending, &failed, &completed); DBG(TLOG_WARN, "%s: state: 0x%08x, new: 0x%02x, pending: 0x%02x, " - "failed: 0x%02x, completed: 0x%02x, last activity: %010ld.%06lld, " - "errors: 0x%04"PRIx64", retries: 0x%04"PRIx64", received: 0x%08"PRIx64", " - "returned: 0x%08"PRIx64", kicked: 0x%08"PRIx64"\n", + "failed: 0x%02x, completed: 0x%02x, last activity: %010ld.%06ld, " + "errors: 0x%04" PRIx64 ", retries: 0x%04" 
PRIx64 ", " + "received: 0x%08" PRIx64 ", returned: 0x%08" PRIx64 ", " + "kicked: 0x%08" PRIx64 "\n", vbd->name, vbd->state, new, pending, failed, completed, - vbd->ts.tv_sec, (unsigned long long)vbd->ts.tv_usec, - vbd->errors, vbd->retries, + vbd->ts.tv_sec, vbd->ts.tv_usec, vbd->errors, vbd->retries, vbd->received, vbd->returned, vbd->kicked); tapdisk_vbd_for_each_image(vbd, image, tmp) td_debug(image); } -static void -tapdisk_vbd_drop_log(td_vbd_t *vbd) +static void tapdisk_vbd_drop_log(td_vbd_t * vbd) { if (td_flag_test(vbd->state, TD_VBD_LOG_DROPPED)) return; tapdisk_vbd_debug(vbd); - tlog_flush(); + tlog_precious(); td_flag_set(vbd->state, TD_VBD_LOG_DROPPED); } -int -tapdisk_vbd_get_image_info(td_vbd_t *vbd, image_t *img) +int tapdisk_vbd_get_disk_info(td_vbd_t * vbd, td_disk_info_t * info) { - td_image_t *image; + if (TAILQ_EMPTY(&vbd->images)) + { + EPRINTF("no images\n"); + return -EINVAL; + } - memset(img, 0, sizeof(image_t)); - - if (list_empty(&vbd->images)) - return -EINVAL; - - image = tapdisk_vbd_first_image(vbd); - img->size = image->info.size; - img->secsize = image->info.sector_size; - img->info = image->info.info; - + *info = tapdisk_vbd_first_image(vbd)->info; return 0; } -int -tapdisk_vbd_queue_ready(td_vbd_t *vbd) +static int tapdisk_vbd_queue_ready(td_vbd_t * vbd) { return (!td_flag_test(vbd->state, TD_VBD_DEAD) && !td_flag_test(vbd->state, TD_VBD_CLOSED) && @@ -866,22 +625,15 @@ tapdisk_vbd_queue_ready(td_vbd_t *vbd) !td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED)); } -int -tapdisk_vbd_retry_needed(td_vbd_t *vbd) +int tapdisk_vbd_retry_needed(td_vbd_t * vbd) { - return td_flag_test(vbd->state, TD_VBD_RETRY_NEEDED); + return !(TAILQ_EMPTY(&vbd->failed_requests) && + TAILQ_EMPTY(&vbd->new_requests)); } -int -tapdisk_vbd_lock(td_vbd_t *vbd) +int tapdisk_vbd_quiesce_queue(td_vbd_t * vbd) { - return 0; -} - -int -tapdisk_vbd_quiesce_queue(td_vbd_t *vbd) -{ - if (!list_empty(&vbd->pending_requests)) { + if 
(!TAILQ_EMPTY(&vbd->pending_requests)) { td_flag_set(vbd->state, TD_VBD_QUIESCE_REQUESTED); return -EAGAIN; } @@ -891,24 +643,24 @@ tapdisk_vbd_quiesce_queue(td_vbd_t *vbd) return 0; } -int -tapdisk_vbd_start_queue(td_vbd_t *vbd) +int tapdisk_vbd_start_queue(td_vbd_t * vbd) { td_flag_clear(vbd->state, TD_VBD_QUIESCED); td_flag_clear(vbd->state, TD_VBD_QUIESCE_REQUESTED); + tapdisk_vbd_mark_progress(vbd); return 0; } -int -tapdisk_vbd_kill_queue(td_vbd_t *vbd) +int tapdisk_vbd_kill_queue(td_vbd_t * vbd) { tapdisk_vbd_quiesce_queue(vbd); td_flag_set(vbd->state, TD_VBD_DEAD); return 0; } -static int -tapdisk_vbd_open_image(td_vbd_t *vbd, td_image_t *image) +/* XXX This is commented out in blktap2.5. */ +#if 0 +static int tapdisk_vbd_open_image(td_vbd_t * vbd, td_image_t * image) { int err; td_image_t *parent; @@ -928,33 +680,14 @@ tapdisk_vbd_open_image(td_vbd_t *vbd, td return 0; } +#endif -static int -tapdisk_vbd_close_and_reopen_image(td_vbd_t *vbd, td_image_t *image) -{ - int i, err; - - td_close(image); - - for (i = 0; i < TD_VBD_EIO_RETRIES; i++) { - err = tapdisk_vbd_open_image(vbd, image); - if (err != -EIO) - break; - - sleep(TD_VBD_EIO_SLEEP); - } - - if (err) - td_flag_set(vbd->state, TD_VBD_CLOSED); - - return err; -} - -int -tapdisk_vbd_pause(td_vbd_t *vbd) +int tapdisk_vbd_pause(td_vbd_t * vbd) { int err; + DBG(TLOG_DBG, "pause requested\n"); + td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED); err = tapdisk_vbd_quiesce_queue(vbd); @@ -963,34 +696,29 @@ tapdisk_vbd_pause(td_vbd_t *vbd) tapdisk_vbd_close_vdi(vbd); + DBG(TLOG_DBG, "pause completed\n"); + td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED); td_flag_set(vbd->state, TD_VBD_PAUSED); return 0; } -int -tapdisk_vbd_resume(td_vbd_t *vbd, const char *path, uint16_t drivertype) +int tapdisk_vbd_resume(td_vbd_t * vbd, const char *name) { int i, err; + DBG(TLOG_DBG, "resume requested\n"); + if (!td_flag_test(vbd->state, TD_VBD_PAUSED)) { EPRINTF("resume request for unpaused vbd %s\n", vbd->name); return 
-EINVAL; } - if (path) { - free(vbd->name); - vbd->name = strdup(path); - if (!vbd->name) { - EPRINTF("copying new vbd %s name failed\n", path); - return -EINVAL; - } - } - for (i = 0; i < TD_VBD_EIO_RETRIES; i++) { - err = __tapdisk_vbd_open_vdi(vbd, TD_OPEN_STRICT); - if (err != -EIO) + err = + tapdisk_vbd_open_vdi(vbd, name, vbd->flags | TD_OPEN_STRICT, NULL); + if (!err) break; sleep(TD_VBD_EIO_SLEEP); @@ -999,101 +727,68 @@ tapdisk_vbd_resume(td_vbd_t *vbd, const if (err) return err; + DBG(TLOG_DBG, "resume completed\n"); + tapdisk_vbd_start_queue(vbd); td_flag_clear(vbd->state, TD_VBD_PAUSED); td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED); tapdisk_vbd_check_state(vbd); + DBG(TLOG_DBG, "state checked\n"); + return 0; } -int -tapdisk_vbd_kick(td_vbd_t *vbd) +static int +tapdisk_vbd_request_ttl(td_vbd_request_t * vreq, const struct timeval *now) { - int n; - td_ring_t *ring; - - tapdisk_vbd_check_state(vbd); - - ring = &vbd->ring; - if (!ring->sring) - return 0; - - n = (ring->fe_ring.rsp_prod_pvt - ring->fe_ring.sring->rsp_prod); - if (!n) - return 0; - - vbd->kicked += n; - RING_PUSH_RESPONSES(&ring->fe_ring); - ioctl(ring->fd, BLKTAP_IOCTL_KICK_FE, 0); - - DBG(TLOG_INFO, "kicking %d: rec: 0x%08"PRIx64", ret: 0x%08"PRIx64", kicked: " - "0x%08"PRIx64"\n", n, vbd->received, vbd->returned, vbd->kicked); - - return n; + struct timeval delta; + timersub(now, &vreq->ts, &delta); + return TD_VBD_REQUEST_TIMEOUT - delta.tv_sec; } -static inline void -tapdisk_vbd_write_response_to_ring(td_vbd_t *vbd, blkif_response_t *rsp) +static int +__tapdisk_vbd_request_timeout(td_vbd_request_t * vreq, + const struct timeval *now) { - td_ring_t *ring; - blkif_response_t *rspp; + int timeout; - ring = &vbd->ring; - rspp = RING_GET_RESPONSE(&ring->fe_ring, ring->fe_ring.rsp_prod_pvt); - memcpy(rspp, rsp, sizeof(blkif_response_t)); - ring->fe_ring.rsp_prod_pvt++; + timeout = tapdisk_vbd_request_ttl(vreq, now) < 0; + if (timeout) + ERR(vreq->error, + "req %s timed out, retried %d 
times\n", + vreq->name, vreq->num_retries); + + return timeout; } -static void -tapdisk_vbd_callback(void *arg, blkif_response_t *rsp) +static int tapdisk_vbd_request_timeout(td_vbd_request_t * vreq) { - td_vbd_t *vbd = (td_vbd_t *)arg; - tapdisk_vbd_write_response_to_ring(vbd, rsp); + struct timeval now; + gettimeofday(&now, NULL); + return __tapdisk_vbd_request_timeout(vreq, &now); } -static void -tapdisk_vbd_make_response(td_vbd_t *vbd, td_vbd_request_t *vreq) -{ - blkif_request_t tmp; - blkif_response_t *rsp; - - tmp = vreq->req; - rsp = (blkif_response_t *)&vreq->req; - - rsp->id = tmp.id; - rsp->operation = tmp.operation; - rsp->status = vreq->status; - - DBG(TLOG_DBG, "writing req %d, sec 0x%08"PRIx64", res %d to ring\n", - (int)tmp.id, tmp.sector_number, vreq->status); - - if (rsp->status != BLKIF_RSP_OKAY) - ERR(EIO, "returning BLKIF_RSP %d", rsp->status); - - vbd->returned++; - vbd->callback(vbd->argument, rsp); -} - -void -tapdisk_vbd_check_state(td_vbd_t *vbd) +static void tapdisk_vbd_check_queue_state(td_vbd_t * vbd) { td_vbd_request_t *vreq, *tmp; + struct timeval now; + gettimeofday(&now, NULL); tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests) - if (vreq->num_retries >= TD_VBD_MAX_RETRIES) + if (__tapdisk_vbd_request_timeout(vreq, &now)) tapdisk_vbd_complete_vbd_request(vbd, vreq); - if (!list_empty(&vbd->new_requests) || - !list_empty(&vbd->failed_requests)) + if (!TAILQ_EMPTY(&vbd->new_requests) || + !TAILQ_EMPTY(&vbd->failed_requests)) tapdisk_vbd_issue_requests(vbd); - tapdisk_vbd_for_each_request(vreq, tmp, &vbd->completed_requests) { - tapdisk_vbd_make_response(vbd, vreq); - list_del(&vreq->next); - tapdisk_vbd_initialize_vreq(vreq); } +void tapdisk_vbd_check_state(td_vbd_t * vbd) +{ + tapdisk_vbd_check_queue_state(vbd); + if (td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED)) tapdisk_vbd_quiesce_queue(vbd); @@ -1104,21 +799,21 @@ tapdisk_vbd_check_state(td_vbd_t *vbd) tapdisk_vbd_close(vbd); } -void 
-tapdisk_vbd_check_progress(td_vbd_t *vbd) +void tapdisk_vbd_check_progress(td_vbd_t * vbd) { - int diff; - struct timeval now; + time_t diff; + struct timeval now, delta; - if (list_empty(&vbd->pending_requests)) + if (TAILQ_EMPTY(&vbd->pending_requests)) return; gettimeofday(&now, NULL); - diff = now.tv_sec - vbd->ts.tv_sec; + timersub(&now, &vbd->ts, &delta); + diff = delta.tv_sec; - if (diff >= TD_VBD_WATCHDOG_TIMEOUT) { + if (diff >= TD_VBD_WATCHDOG_TIMEOUT && tapdisk_vbd_queue_ready(vbd)) { DBG(TLOG_WARN, "%s: watchdog timeout: pending requests " - "idle for %d seconds\n", vbd->name, diff); + "idle for %ld seconds\n", vbd->name, diff); tapdisk_vbd_drop_log(vbd); return; } @@ -1130,106 +825,90 @@ tapdisk_vbd_check_progress(td_vbd_t *vbd * request submission */ -static int -tapdisk_vbd_check_queue(td_vbd_t *vbd) +static int tapdisk_vbd_check_queue(td_vbd_t * vbd) { - int err; - td_image_t *image; - - if (list_empty(&vbd->images)) + if (TAILQ_EMPTY(&vbd->images)) return -ENOSYS; if (!tapdisk_vbd_queue_ready(vbd)) return -EAGAIN; - if (!vbd->reopened) { - if (td_flag_test(vbd->state, TD_VBD_LOCKING)) { - err = tapdisk_vbd_lock(vbd); - if (err) - return err; + return 0; } - image = tapdisk_vbd_first_image(vbd); - td_flag_set(image->flags, TD_OPEN_STRICT); +static int +tapdisk_vbd_request_should_retry(td_vbd_t * vbd, td_vbd_request_t * vreq) +{ + if (td_flag_test(vbd->state, TD_VBD_DEAD) || + td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED)) + return 0; - if (tapdisk_vbd_close_and_reopen_image(vbd, image)) - EPRINTF("reopening disks failed\n"); - else { - DPRINTF("reopening disks succeeded\n"); - vbd->reopened = 1; - } + switch (abs(vreq->error)) { + case EPERM: + case ENOSYS: + case ESTALE: + case ENOSPC: + return 0; } + if (tapdisk_vbd_request_timeout(vreq)) return 0; + + return 1; } -void +static void tapdisk_vbd_complete_vbd_request(td_vbd_t *vbd, td_vbd_request_t *vreq) { if (!vreq->submitting && !vreq->secs_pending) { - if (vreq->status == BLKIF_RSP_ERROR 
&& - vreq->num_retries < TD_VBD_MAX_RETRIES && - !td_flag_test(vbd->state, TD_VBD_DEAD) && - !td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED)) + if (vreq->error && tapdisk_vbd_request_should_retry(vbd, vreq)) tapdisk_vbd_move_request(vreq, &vbd->failed_requests); else tapdisk_vbd_move_request(vreq, &vbd->completed_requests); } } -static uint64_t -tapdisk_vbd_breq_get_sector(blkif_request_t *breq, td_request_t treq) +static void +FIXME_maybe_count_enospc_redirect(td_vbd_t * vbd, td_request_t treq) { - int seg, nsects; - uint64_t sector_nr = breq->sector_number; - - for(seg=0; seg < treq.sidx; seg++) { - nsects = breq->seg[seg].last_sect - breq->seg[seg].first_sect + 1; - sector_nr += nsects; - } - - return sector_nr; + int write = treq.op == TD_OP_WRITE; + if (write && + treq.image == tapdisk_vbd_first_image(vbd) && + vbd->FIXME_enospc_redirect_count_enabled) + vbd->FIXME_enospc_redirect_count += treq.secs; } static void __tapdisk_vbd_complete_td_request(td_vbd_t *vbd, td_vbd_request_t *vreq, td_request_t treq, int res) { - int err; td_image_t *image = treq.image; + int err; err = (res <= 0 ? res : -res); vbd->secs_pending -= treq.secs; vreq->secs_pending -= treq.secs; - vreq->blocked = treq.blocked; + if (err != -EBUSY) { + int write = treq.op == TD_OP_WRITE; + td_sector_count_add(&image->stats.hits, treq.secs, write); + if (err) + td_sector_count_add(&image->stats.fail, treq.secs, write); + + FIXME_maybe_count_enospc_redirect(vbd, treq); + } if (err) { - vreq->status = BLKIF_RSP_ERROR; - vreq->error = (vreq->error ? : err); if (err != -EBUSY) { - vbd->errors++; - ERR(err, "req %"PRIu64": %s 0x%04x secs to " - "0x%08"PRIx64, vreq->req.id, + if (!vreq->error && err != vreq->prev_error) + tlog_drv_error(image->driver, err, + "req %s: %s 0x%04x secs @ 0x%08" PRIx64, + vreq->name, (treq.op == TD_OP_WRITE ? 
"write" : "read"), treq.secs, treq.sec); + vbd->errors++; } - } else { -#ifdef MEMSHR - if (treq.op == TD_OP_READ - && td_flag_test(image->flags, TD_OPEN_RDONLY)) { - share_tuple_t hnd = treq.memshr_hnd; - uint16_t uid = image->memshr_id; - blkif_request_t *breq = &vreq->req; - uint64_t sec = tapdisk_vbd_breq_get_sector(breq, treq); - int secs = breq->seg[treq.sidx].last_sect - - breq->seg[treq.sidx].first_sect + 1; - - if (hnd.handle != 0) - memshr_vbd_complete_ro_request(hnd, uid, - sec, secs); - } -#endif + vreq->error = (vreq->error ? : err); } tapdisk_vbd_complete_vbd_request(vbd, vreq); @@ -1242,7 +921,7 @@ static void td_image_t *parent; td_vbd_request_t *vreq; - vreq = (td_vbd_request_t *)treq.private; + vreq = treq.vreq; gettimeofday(&vreq->last_try, NULL); vreq->submitting++; @@ -1282,27 +961,6 @@ static void break; case TD_OP_READ: -#ifdef MEMSHR - if(td_flag_test(parent->flags, TD_OPEN_RDONLY)) { - int ret, seg = treq.sidx; - blkif_request_t *breq = &vreq->req; - - ret = memshr_vbd_issue_ro_request(treq.buf, - breq->seg[seg].gref, - parent->memshr_id, - treq.sec, - treq.secs, - &treq.memshr_hnd); - if(ret == 0) { - /* Reset memshr handle. 
This'll prevent - * memshr_vbd_complete_ro_request being called - */ - treq.memshr_hnd.handle = 0; - td_complete_request(treq, 0); - } else - td_queue_read(parent, treq); - } else -#endif td_queue_read(parent, treq); break; } @@ -1313,114 +971,144 @@ done: tapdisk_vbd_complete_vbd_request(vbd, vreq); } -void -tapdisk_vbd_forward_request(td_request_t treq) +void tapdisk_vbd_forward_request(td_request_t treq) { td_vbd_t *vbd; td_image_t *image; td_vbd_request_t *vreq; image = treq.image; - vbd = (td_vbd_t *)image->private; - vreq = (td_vbd_request_t *)treq.private; + vreq = treq.vreq; + vbd = vreq->vbd; - gettimeofday(&vbd->ts, NULL); + tapdisk_vbd_mark_progress(vbd); if (tapdisk_vbd_queue_ready(vbd)) __tapdisk_vbd_reissue_td_request(vbd, image, treq); else - __tapdisk_vbd_complete_td_request(vbd, vreq, treq, -EIO); + __tapdisk_vbd_complete_td_request(vbd, vreq, treq, -EBUSY); } -static void -tapdisk_vbd_complete_td_request(td_request_t treq, int res) +void tapdisk_vbd_complete_td_request(td_request_t treq, int res) { td_vbd_t *vbd; - td_image_t *image; + td_image_t *image, *leaf; td_vbd_request_t *vreq; image = treq.image; - vbd = (td_vbd_t *)image->private; - vreq = (td_vbd_request_t *)treq.private; + vreq = treq.vreq; + vbd = vreq->vbd; - gettimeofday(&vbd->ts, NULL); - DBG(TLOG_DBG, "%s: req %d seg %d sec 0x%08"PRIx64" " + tapdisk_vbd_mark_progress(vbd); + + if (abs(res) == ENOSPC && td_flag_test(image->flags, TD_IGNORE_ENOSPC)) { + res = 0; + leaf = tapdisk_vbd_first_image(vbd); + if (vbd->secondary_mode == TD_VBD_SECONDARY_MIRROR) { + DPRINTF("ENOSPC: disabling mirroring\n"); + TAILQ_REMOVE(&vbd->images, leaf, entry); + vbd->retired = leaf; + } else if (vbd->secondary_mode == TD_VBD_SECONDARY_STANDBY) { + DPRINTF("ENOSPC: failing over to secondary image\n"); + TAILQ_INSERT_BEFORE(leaf, vbd->secondary, entry); + vbd->FIXME_enospc_redirect_count_enabled = 1; + } + if (vbd->secondary_mode != TD_VBD_SECONDARY_DISABLED) { + vbd->secondary = NULL; + 
vbd->secondary_mode = TD_VBD_SECONDARY_DISABLED; + signal_enospc(vbd); + } + } + + DBG(TLOG_DBG, "%s: req %s seg %d sec 0x%08" PRIx64 "secs 0x%04x buf %p op %d res %d\n", image->name, - (int)treq.id, treq.sidx, treq.sec, treq.secs, - treq.buf, (int)vreq->req.operation, res); + vreq->name, treq.sidx, treq.sec, treq.secs, + treq.buf, vreq->op, res); __tapdisk_vbd_complete_td_request(vbd, vreq, treq, res); } +static inline void queue_mirror_req(td_vbd_t * vbd, td_request_t clone) +{ + clone.image = vbd->secondary; + td_queue_write(vbd->secondary, clone); +} + static int tapdisk_vbd_issue_request(td_vbd_t *vbd, td_vbd_request_t *vreq) { - char *page; - td_ring_t *ring; td_image_t *image; td_request_t treq; - uint64_t sector_nr; - blkif_request_t *req; - int i, err, id, nsects; + td_sector_t sec; + int i, err; - req = &vreq->req; - id = req->id; - ring = &vbd->ring; - sector_nr = req->sector_number; + sec = vreq->sec; image = tapdisk_vbd_first_image(vbd); vreq->submitting = 1; - gettimeofday(&vbd->ts, NULL); - gettimeofday(&vreq->last_try, NULL); + + tapdisk_vbd_mark_progress(vbd); + vreq->last_try = vbd->ts; + tapdisk_vbd_move_request(vreq, &vbd->pending_requests); -#if 0 err = tapdisk_vbd_check_queue(vbd); - if (err) + if (err) { + vreq->error = err; goto fail; -#endif + } - err = tapdisk_image_check_ring_request(image, req); - if (err) + err = tapdisk_image_check_request(image, vreq); + if (err) { + vreq->error = err; goto fail; + } - for (i = 0; i < req->nr_segments; i++) { - nsects = req->seg[i].last_sect - req->seg[i].first_sect + 1; - page = (char *)MMAP_VADDR(ring->vstart, - (unsigned long)req->id, i); - page += (req->seg[i].first_sect << SECTOR_SHIFT); + for (i = 0; i < vreq->iovcnt; i++) { + struct td_iovec *iov = &vreq->iov[i]; - treq.id = id; treq.sidx = i; - treq.blocked = 0; - treq.buf = page; - treq.sec = sector_nr; - treq.secs = nsects; + treq.buf = iov->base; + treq.sec = sec; + treq.secs = iov->secs; treq.image = image; treq.cb = 
tapdisk_vbd_complete_td_request; treq.cb_data = NULL; - treq.private = vreq; + treq.vreq = vreq; - DBG(TLOG_DBG, "%s: req %d seg %d sec 0x%08"PRIx64" secs 0x%04x " - "buf %p op %d\n", image->name, id, i, treq.sec, treq.secs, - treq.buf, (int)req->operation); - vreq->secs_pending += nsects; - vbd->secs_pending += nsects; + vreq->secs_pending += iov->secs; + vbd->secs_pending += iov->secs; + if (vbd->secondary_mode == TD_VBD_SECONDARY_MIRROR && + vreq->op == TD_OP_WRITE) { + vreq->secs_pending += iov->secs; + vbd->secs_pending += iov->secs; + } - switch (req->operation) { - case BLKIF_OP_WRITE: + switch (vreq->op) { + case TD_OP_WRITE: treq.op = TD_OP_WRITE; - td_queue_write(image, treq); + /* it's important to queue the mirror request before queuing + * the main one. If the main image runs into ENOSPC, the + * mirroring could be disabled before td_queue_write returns, + * so if the mirror request was queued after (which would then + * not happen), we'd lose that write and cause the process to + * hang with unacknowledged writes */ + if (vbd->secondary_mode == TD_VBD_SECONDARY_MIRROR) + queue_mirror_req(vbd, treq); + td_queue_write(treq.image, treq); break; - case BLKIF_OP_READ: + case TD_OP_READ: treq.op = TD_OP_READ; - td_queue_read(image, treq); + td_queue_read(treq.image, treq); break; } - sector_nr += nsects; + DBG(TLOG_DBG, "%s: req %s seg %d sec 0x%08" PRIx64 " secs 0x%04x " + "buf %p op %d\n", image->name, vreq->name, i, treq.sec, + treq.secs, treq.buf, vreq->op); + sec += iov->secs; } err = 0; @@ -1435,12 +1123,17 @@ out: return err; fail: - vreq->status = BLKIF_RSP_ERROR; + vreq->error = err; goto out; } static int -tapdisk_vbd_reissue_failed_requests(td_vbd_t *vbd) +tapdisk_vbd_request_completed(td_vbd_t * vbd, td_vbd_request_t * vreq) +{ + return vreq->list_head == &vbd->completed_requests; +} + +static int tapdisk_vbd_reissue_failed_requests(td_vbd_t * vbd) { int err; struct timeval now; @@ -1453,93 +1146,109 @@ tapdisk_vbd_reissue_failed_requests(td_v 
if (vreq->secs_pending) continue; - if (td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED)) - goto fail; + if (td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED)) { + tapdisk_vbd_complete_vbd_request(vbd, vreq); + continue; + } if (vreq->error != -EBUSY && now.tv_sec - vreq->last_try.tv_sec < TD_VBD_RETRY_INTERVAL) continue; - if (vreq->num_retries >= TD_VBD_MAX_RETRIES) { - fail: - DBG(TLOG_INFO, "req %"PRIu64"retried %d times\n", - vreq->req.id, vreq->num_retries); - tapdisk_vbd_complete_vbd_request(vbd, vreq); - continue; - } - - /* - * never fail due to too many retries if we are blocked on a - * dependency - */ - if (vreq->blocked) { - vreq->blocked = 0; - } else { vbd->retries++; vreq->num_retries++; - } + + vreq->prev_error = vreq->error; vreq->error = 0; - vreq->status = BLKIF_RSP_OKAY; - DBG(TLOG_DBG, "retry #%d of req %"PRIu64", " - "sec 0x%08"PRIx64", nr_segs: %d\n", vreq->num_retries, - vreq->req.id, vreq->req.sector_number, - vreq->req.nr_segments); + + DBG(TLOG_DBG, "retry #%d of req %s, " + "sec 0x%08" PRIx64 ", iovcnt: %d\n", vreq->num_retries, + vreq->name, vreq->sec, vreq->iovcnt); err = tapdisk_vbd_issue_request(vbd, vreq); - if (err) + /* + * if this request failed, but was not completed, + * we'll back off for a while. 
+ */ + if (err && !tapdisk_vbd_request_completed(vbd, vreq)) break; } - if (list_empty(&vbd->failed_requests)) - td_flag_clear(vbd->state, TD_VBD_RETRY_NEEDED); - else - td_flag_set(vbd->state, TD_VBD_RETRY_NEEDED); - - return err; + return 0; } -static int -tapdisk_vbd_issue_new_requests(td_vbd_t *vbd) +static void +tapdisk_vbd_count_new_request(td_vbd_t * vbd, td_vbd_request_t * vreq) +{ + struct td_iovec *iov; + int write; + + write = vreq->op == TD_OP_WRITE; + + for (iov = &vreq->iov[0]; iov < &vreq->iov[vreq->iovcnt]; iov++) + td_sector_count_add(&vbd->secs, iov->secs, write); +} + +static int tapdisk_vbd_issue_new_requests(td_vbd_t * vbd) { int err; td_vbd_request_t *vreq, *tmp; tapdisk_vbd_for_each_request(vreq, tmp, &vbd->new_requests) { err = tapdisk_vbd_issue_request(vbd, vreq); - if (err) + /* + * if this request failed, but was not completed, + * we'll back off for a while. + */ + if (err && !tapdisk_vbd_request_completed(vbd, vreq)) return err; + + tapdisk_vbd_count_new_request(vbd, vreq); } return 0; } -static int -tapdisk_vbd_kill_requests(td_vbd_t *vbd) +int tapdisk_vbd_recheck_state(td_vbd_t * vbd) +{ + if (TAILQ_EMPTY(&vbd->new_requests)) + return 0; + + if (td_flag_test(vbd->state, TD_VBD_QUIESCED) || + td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED)) + return 0; + + tapdisk_vbd_issue_new_requests(vbd); + + return 1; +} + +static int tapdisk_vbd_kill_requests(td_vbd_t * vbd) { td_vbd_request_t *vreq, *tmp; tapdisk_vbd_for_each_request(vreq, tmp, &vbd->new_requests) { - vreq->status = BLKIF_RSP_ERROR; + vreq->error = -ESHUTDOWN; tapdisk_vbd_move_request(vreq, &vbd->completed_requests); } tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests) { - vreq->status = BLKIF_RSP_ERROR; + vreq->error = -ESHUTDOWN; tapdisk_vbd_move_request(vreq, &vbd->completed_requests); } return 0; } -int -tapdisk_vbd_issue_requests(td_vbd_t *vbd) +int tapdisk_vbd_issue_requests(td_vbd_t * vbd) { int err; if (td_flag_test(vbd->state, TD_VBD_DEAD)) return 
tapdisk_vbd_kill_requests(vbd); - if (!tapdisk_vbd_queue_ready(vbd)) + if (td_flag_test(vbd->state, TD_VBD_QUIESCED) || + td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED)) return -EAGAIN; err = tapdisk_vbd_reissue_failed_requests(vbd); @@ -1549,175 +1258,71 @@ tapdisk_vbd_issue_requests(td_vbd_t *vbd return tapdisk_vbd_issue_new_requests(vbd); } -static void -tapdisk_vbd_pull_ring_requests(td_vbd_t *vbd) +int tapdisk_vbd_queue_request(td_vbd_t * vbd, td_vbd_request_t * vreq) { - int idx; - RING_IDX rp, rc; - td_ring_t *ring; - blkif_request_t *req; - td_vbd_request_t *vreq; - - ring = &vbd->ring; - if (!ring->sring) - return; - - rp = ring->fe_ring.sring->req_prod; - xen_rmb(); - - for (rc = ring->fe_ring.req_cons; rc != rp; rc++) { - req = RING_GET_REQUEST(&ring->fe_ring, rc); - ++ring->fe_ring.req_cons; - - idx = req->id; - vreq = &vbd->request_list[idx]; - - ASSERT(list_empty(&vreq->next)); - ASSERT(vreq->secs_pending == 0); - - memcpy(&vreq->req, req, sizeof(blkif_request_t)); - vbd->received++; + gettimeofday(&vreq->ts, NULL); vreq->vbd = vbd; - tapdisk_vbd_move_request(vreq, &vbd->new_requests); + vreq->list_head = &vbd->new_requests; + TAILQ_INSERT_TAIL(&vbd->new_requests, vreq, next); + vbd->received++; - DBG(TLOG_DBG, "%s: request %d \n", vbd->name, idx); + return 0; +} + +void tapdisk_vbd_kick(td_vbd_t * vbd) +{ + struct tqh_td_vbd_request *list = &vbd->completed_requests; + td_vbd_request_t *vreq, *prev, *next; + + vbd->kicked++; + + while (!TAILQ_EMPTY(list)) { + prev = TAILQ_FIRST(list); + TAILQ_REMOVE(list, prev, next); + + tapdisk_vbd_for_each_request(vreq, next, list) { + if (vreq->token == prev->token) { + + prev->cb(prev, prev->error, prev->token, 0); + vbd->returned++; + + TAILQ_REMOVE(list, vreq, next); + prev = vreq; + } + } + + prev->cb(prev, prev->error, prev->token, 1); + vbd->returned++; } } -static int -tapdisk_vbd_pause_ring(td_vbd_t *vbd) +void tapdisk_vbd_stats(td_vbd_t * vbd, td_stats_t * st) { - int err; + td_image_t *image, *next; 
- if (td_flag_test(vbd->state, TD_VBD_PAUSED)) - return 0; + tapdisk_stats_enter(st, '{'); + tapdisk_stats_field(st, "name", "s", vbd->name); - td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED); + tapdisk_stats_field(st, "secs", "["); + tapdisk_stats_val(st, "llu", vbd->secs.rd); + tapdisk_stats_val(st, "llu", vbd->secs.wr); + tapdisk_stats_leave(st, ']'); - err = tapdisk_vbd_quiesce_queue(vbd); - if (err) { - EPRINTF("%s: ring pause request on active queue\n", vbd->name); - return err; + tapdisk_stats_field(st, "images", "["); + tapdisk_vbd_for_each_image(vbd, image, next) + tapdisk_image_stats(image, st); + tapdisk_stats_leave(st, ']'); + + if (vbd->tap) { + tapdisk_stats_field(st, "tap", "{"); + tapdisk_xenblkif_stats(vbd->tap, st); + tapdisk_stats_leave(st, '}'); } - tapdisk_vbd_close_vdi(vbd); + tapdisk_stats_field(st, + "FIXME_enospc_redirect_count", + "llu", vbd->FIXME_enospc_redirect_count); - err = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_PAUSE, 0); - if (err) - EPRINTF("%s: pause ioctl failed: %d\n", vbd->name, errno); - else { - td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED); - td_flag_set(vbd->state, TD_VBD_PAUSED); - } - - return err; + tapdisk_stats_leave(st, '}'); } - -static int -tapdisk_vbd_resume_ring(td_vbd_t *vbd) -{ - int i, err, type; - char message[BLKTAP2_MAX_MESSAGE_LEN]; - const char *path; - - memset(message, 0, sizeof(message)); - - if (!td_flag_test(vbd->state, TD_VBD_PAUSED)) { - EPRINTF("%s: resume message for unpaused vbd\n", vbd->name); - return -EINVAL; - } - - err = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_REOPEN, &message); - if (err) { - EPRINTF("%s: resume ioctl failed: %d\n", vbd->name, errno); - return err; - } - - err = tapdisk_parse_disk_type(message, &path, &type); - if (err) { - EPRINTF("%s: invalid resume string %s\n", vbd->name, message); - goto out; - } - - free(vbd->name); - vbd->name = strdup(path); - if (!vbd->name) { - EPRINTF("resume malloc failed\n"); - err = -ENOMEM; - goto out; - } - - tapdisk_vbd_start_queue(vbd); - - for 
(i = 0; i < TD_VBD_EIO_RETRIES; i++) { - err = __tapdisk_vbd_open_vdi(vbd, TD_OPEN_STRICT); - if (err != -EIO) - break; - - sleep(TD_VBD_EIO_SLEEP); - } - -out: - if (!err) { - image_t image; - struct blktap2_params params; - - memset(¶ms, 0, sizeof(params)); - tapdisk_vbd_get_image_info(vbd, &image); - - params.sector_size = image.secsize; - params.capacity = image.size; - snprintf(params.name, sizeof(params.name) - 1, "%s", message); - - ioctl(vbd->ring.fd, BLKTAP2_IOCTL_SET_PARAMS, ¶ms); - td_flag_clear(vbd->state, TD_VBD_PAUSED); - } - - ioctl(vbd->ring.fd, BLKTAP2_IOCTL_RESUME, err); - return err; -} - -static int -tapdisk_vbd_check_ring_message(td_vbd_t *vbd) -{ - if (!vbd->ring.sring) - return -EINVAL; - - switch (vbd->ring.sring->private.tapif_user.msg) { - case 0: - return 0; - - case BLKTAP2_RING_MESSAGE_PAUSE: - return tapdisk_vbd_pause_ring(vbd); - - case BLKTAP2_RING_MESSAGE_RESUME: - return tapdisk_vbd_resume_ring(vbd); - - case BLKTAP2_RING_MESSAGE_CLOSE: - return tapdisk_vbd_close(vbd); - - default: - return -EINVAL; - } -} - -static void -tapdisk_vbd_ring_event(event_id_t id, char mode, void *private) -{ - td_vbd_t *vbd; - - vbd = (td_vbd_t *)private; - - tapdisk_vbd_pull_ring_requests(vbd); - tapdisk_vbd_issue_requests(vbd); - - /* vbd may be destroyed after this call */ - tapdisk_vbd_check_ring_message(vbd); -} - -td_image_t * -tapdisk_vbd_first_image(td_vbd_t *vbd) -{ - return list_entry(vbd->images.next, td_image_t, next); -} _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx http://lists.xen.org/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |