Index: xen-unstable.hg/tools/blktap/drivers/Makefile =================================================================== --- xen-unstable.hg.orig/tools/blktap/drivers/Makefile +++ xen-unstable.hg/tools/blktap/drivers/Makefile @@ -35,6 +35,7 @@ BLK-OBJS += block-vmdk.o BLK-OBJS += block-ram.o BLK-OBJS += block-qcow.o BLK-OBJS += aes.o +BLK-OBJS += tapaio.o all: $(IBIN) qcow-util Index: xen-unstable.hg/tools/blktap/drivers/block-aio.c =================================================================== --- xen-unstable.hg.orig/tools/blktap/drivers/block-aio.c +++ xen-unstable.hg/tools/blktap/drivers/block-aio.c @@ -43,14 +43,7 @@ #include #include #include "tapdisk.h" - - -/** - * We used a kernel patch to return an fd associated with the AIO context - * so that we can concurrently poll on synchronous and async descriptors. - * This is signalled by passing 1 as the io context to io_setup. - */ -#define REQUEST_ASYNC_FD 1 +#include "tapaio.h" #define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ) @@ -65,14 +58,13 @@ struct tdaio_state { int fd; /* libaio state */ - io_context_t aio_ctx; + tap_aio_context_t aio_ctx; struct iocb iocb_list [MAX_AIO_REQS]; struct iocb *iocb_free [MAX_AIO_REQS]; struct pending_aio pending_aio[MAX_AIO_REQS]; int iocb_free_count; struct iocb *iocb_queue[MAX_AIO_REQS]; int iocb_queued; - int poll_fd; /* NB: we require aio_poll support */ struct io_event aio_events[MAX_AIO_REQS]; }; @@ -148,7 +140,7 @@ static inline void init_fds(struct disk_ for(i = 0; i < MAX_IOFD; i++) dd->io_fd[i] = 0; - dd->io_fd[0] = prv->poll_fd; + dd->io_fd[0] = prv->aio_ctx.pollfd; } /* Open the disk file and initialize aio state. 
*/ @@ -162,12 +154,9 @@ int tdaio_open (struct disk_driver *dd, /* Initialize AIO */ prv->iocb_free_count = MAX_AIO_REQS; prv->iocb_queued = 0; - - prv->aio_ctx = (io_context_t) REQUEST_ASYNC_FD; - prv->poll_fd = io_setup(MAX_AIO_REQS, &prv->aio_ctx); - if (prv->poll_fd < 0) { - ret = prv->poll_fd; + ret = tap_aio_setup(&prv->aio_ctx, prv->aio_events, MAX_AIO_REQS); + if (ret < 0) { if (ret == -EAGAIN) { DPRINTF("Couldn't setup AIO context. If you are " "trying to concurrently use a large number " @@ -176,9 +165,7 @@ int tdaio_open (struct disk_driver *dd, "(e.g. 'echo echo 1048576 > /proc/sys/fs/" "aio-max-nr')\n"); } else { - DPRINTF("Couldn't get fd for AIO poll support. This " - "is probably because your kernel does not " - "have the aio-poll patch applied.\n"); + DPRINTF("Couldn't setup AIO context.\n"); } goto done; } @@ -286,7 +273,7 @@ int tdaio_submit(struct disk_driver *dd) if (!prv->iocb_queued) return 0; - ret = io_submit(prv->aio_ctx, prv->iocb_queued, prv->iocb_queue); + ret = io_submit(prv->aio_ctx.aio_ctx, prv->iocb_queued, prv->iocb_queue); /* XXX: TODO: Handle error conditions here. */ @@ -300,7 +287,7 @@ int tdaio_close(struct disk_driver *dd) { struct tdaio_state *prv = (struct tdaio_state *)dd->private; - io_destroy(prv->aio_ctx); + io_destroy(prv->aio_ctx.aio_ctx); close(prv->fd); return 0; @@ -308,15 +295,13 @@ int tdaio_close(struct disk_driver *dd) int tdaio_do_callbacks(struct disk_driver *dd, int sid) { - int ret, i, rsp = 0; + int i, nr_events, rsp = 0; struct io_event *ep; struct tdaio_state *prv = (struct tdaio_state *)dd->private; - /* Non-blocking test for completed io. 
*/ - ret = io_getevents(prv->aio_ctx, 0, MAX_AIO_REQS, prv->aio_events, - NULL); - - for (ep=prv->aio_events,i=ret; i-->0; ep++) { + nr_events = tap_aio_get_events(&prv->aio_ctx); +repeat: + for (ep = prv->aio_events, i = nr_events; i-- > 0; ep++) { struct iocb *io = ep->obj; struct pending_aio *pio; @@ -327,6 +312,14 @@ int tdaio_do_callbacks(struct disk_drive prv->iocb_free[prv->iocb_free_count++] = io; } + + if (nr_events) { + nr_events = tap_aio_more_events(&prv->aio_ctx); + goto repeat; + } + + tap_aio_continue(&prv->aio_ctx); + return rsp; } Index: xen-unstable.hg/tools/blktap/drivers/block-qcow.c =================================================================== --- xen-unstable.hg.orig/tools/blktap/drivers/block-qcow.c +++ xen-unstable.hg/tools/blktap/drivers/block-qcow.c @@ -38,6 +38,7 @@ #include "bswap.h" #include "aes.h" #include "tapdisk.h" +#include "tapaio.h" #if 1 #define ASSERT(_p) \ @@ -53,9 +54,6 @@ (l + (s - 1)) - ((l + (s - 1)) % s)); \ }) -/******AIO DEFINES******/ -#define REQUEST_ASYNC_FD 1 - struct pending_aio { td_callback_t cb; int id; @@ -145,7 +143,7 @@ struct tdqcow_state { AES_KEY aes_encrypt_key; /*AES key*/ AES_KEY aes_decrypt_key; /*AES key*/ /* libaio state */ - io_context_t aio_ctx; + tap_aio_context_t aio_ctx; int max_aio_reqs; struct iocb *iocb_list; struct iocb **iocb_free; @@ -153,7 +151,6 @@ struct tdqcow_state { int iocb_free_count; struct iocb **iocb_queue; int iocb_queued; - int poll_fd; /* NB: we require aio_poll support */ struct io_event *aio_events; }; @@ -179,7 +176,7 @@ static void free_aio_state(struct disk_d static int init_aio_state(struct disk_driver *dd) { - int i; + int i, ret; struct td_state *bs = dd->td_state; struct tdqcow_state *s = (struct tdqcow_state *)dd->private; long ioidx; @@ -216,12 +213,9 @@ static int init_aio_state(struct disk_dr goto fail; } - /*Signal kernel to create Poll FD for Asyc completion events*/ - s->aio_ctx = (io_context_t) REQUEST_ASYNC_FD; - s->poll_fd = io_setup(s->max_aio_reqs, 
&s->aio_ctx); - - if (s->poll_fd < 0) { - if (s->poll_fd == -EAGAIN) { + ret = tap_aio_setup(&s->aio_ctx, s->aio_events, s->max_aio_reqs); + if (ret < 0) { + if (ret == -EAGAIN) { DPRINTF("Couldn't setup AIO context. If you are " "trying to concurrently use a large number " "of blktap-based disks, you may need to " @@ -229,9 +223,7 @@ static int init_aio_state(struct disk_dr "(e.g. 'echo echo 1048576 > /proc/sys/fs/" "aio-max-nr')\n"); } else { - DPRINTF("Couldn't get fd for AIO poll support. This " - "is probably because your kernel does not " - "have the aio-poll patch applied.\n"); + DPRINTF("Couldn't setup AIO context.\n"); } goto fail; } @@ -845,7 +837,7 @@ static inline void init_fds(struct disk_ for(i = 0; i < MAX_IOFD; i++) dd->io_fd[i] = 0; - dd->io_fd[0] = s->poll_fd; + dd->io_fd[0] = s->aio_ctx.pollfd; } /* Open the disk file and initialize qcow state. */ @@ -1144,7 +1136,7 @@ int tdqcow_submit(struct disk_driver *dd if (!prv->iocb_queued) return 0; - ret = io_submit(prv->aio_ctx, prv->iocb_queued, prv->iocb_queue); + ret = io_submit(prv->aio_ctx.aio_ctx, prv->iocb_queued, prv->iocb_queue); /* XXX: TODO: Handle error conditions here. */ @@ -1172,7 +1164,7 @@ int tdqcow_close(struct disk_driver *dd) close(fd); } - io_destroy(s->aio_ctx); + io_destroy(s->aio_ctx.aio_ctx); free(s->name); free(s->l1_table); free(s->l2_cache); @@ -1184,17 +1176,15 @@ int tdqcow_close(struct disk_driver *dd) int tdqcow_do_callbacks(struct disk_driver *dd, int sid) { - int ret, i, rsp = 0,*ptr; + int ret, i, nr_events, rsp = 0,*ptr; struct io_event *ep; struct tdqcow_state *prv = (struct tdqcow_state *)dd->private; if (sid > MAX_IOFD) return 1; - - /* Non-blocking test for completed io. 
*/ - ret = io_getevents(prv->aio_ctx, 0, prv->max_aio_reqs, prv->aio_events, - NULL); - for (ep = prv->aio_events, i = ret; i-- > 0; ep++) { + nr_events = tap_aio_get_events(&prv->aio_ctx); +repeat: + for (ep = prv->aio_events, i = nr_events; i-- > 0; ep++) { struct iocb *io = ep->obj; struct pending_aio *pio; @@ -1215,6 +1205,14 @@ int tdqcow_do_callbacks(struct disk_driv prv->iocb_free[prv->iocb_free_count++] = io; } + + if (nr_events) { + nr_events = tap_aio_more_events(&prv->aio_ctx); + goto repeat; + } + + tap_aio_continue(&prv->aio_ctx); + return rsp; } Index: xen-unstable.hg/tools/blktap/drivers/tapaio.c =================================================================== --- /dev/null +++ xen-unstable.hg/tools/blktap/drivers/tapaio.c @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2006 Andrew Warfield and Julian Chesterfield + * Copyright (c) 2007 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "tapaio.h" +#include "tapdisk.h" +#include <unistd.h> + +/** + * We used a kernel patch to return an fd associated with the AIO context + * so that we can concurrently poll on synchronous and async descriptors. + * This is signalled by passing 1 as the io context to io_setup. + */ +#define REQUEST_ASYNC_FD 1 + +/* + * If we don't have any way to do epoll on aio events in a normal kernel, + * wait for aio events in a separate thread and return completion status + * via a pipe that can be waited on normally. + * + * To keep locking problems between the completion thread and the submit + * thread to a minimum, there's a handshake which allows only one thread + * to be doing work on the completion queue at a time: + * + * 1) main thread sends completion thread a command via the command pipe; + * 2) completion thread waits for aio events and returns the number + * received on the completion pipe + * 3) main thread processes the received ctx->aio_events events + * 4) loop back to 1) to let the completion thread refill the aio_events + * buffer. + * + * This workaround needs to disappear once the kernel provides a single + * mechanism for waiting on both aio and normal fd wakeups.
+ */ +static void * +tap_aio_completion_thread(void *arg) +{ + tap_aio_context_t *ctx = (tap_aio_context_t *) arg; + int command; + int nr_events; + int rc; + + while (1) { + rc = read(ctx->command_fd[0], &command, sizeof(command)); + + do { + rc = io_getevents(ctx->aio_ctx, 1, + ctx->max_aio_events, ctx->aio_events, + NULL); + if (rc) { + nr_events = rc; + rc = write(ctx->completion_fd[1], &nr_events, + sizeof(nr_events)); + } + } while (!rc); + } +} + +void +tap_aio_continue(tap_aio_context_t *ctx) +{ + int cmd = 0; + + if (!ctx->poll_in_thread) + return; + + if (write(ctx->command_fd[1], &cmd, sizeof(cmd)) < 0) + DPRINTF("Cannot write to command pipe\n"); +} + +int +tap_aio_setup(tap_aio_context_t *ctx, + struct io_event *aio_events, + int max_aio_events) +{ + int ret; + + ctx->aio_events = aio_events; + ctx->max_aio_events = max_aio_events; + ctx->poll_in_thread = 0; + + ctx->aio_ctx = (io_context_t) REQUEST_ASYNC_FD; + ret = io_setup(ctx->max_aio_events, &ctx->aio_ctx); + if (ret < 0 && ret != -EINVAL) + return ret; + else if (ret > 0) { + ctx->pollfd = ret; + return ctx->pollfd; + } + + ctx->aio_ctx = (io_context_t) 0; + ret = io_setup(ctx->max_aio_events, &ctx->aio_ctx); + if (ret < 0) + return ret; + + if ((ret = pipe(ctx->command_fd)) < 0) { + DPRINTF("Unable to create command pipe\n"); + return -1; + } + if ((ret = pipe(ctx->completion_fd)) < 0) { + DPRINTF("Unable to create completion pipe\n"); + return -1; + } + + if ((ret = pthread_create(&ctx->aio_thread, NULL, + tap_aio_completion_thread, ctx)) != 0) { + DPRINTF("Unable to create completion thread\n"); + return -1; + } + + ctx->pollfd = ctx->completion_fd[0]; + ctx->poll_in_thread = 1; + + tap_aio_continue(ctx); + + return 0; +} + +int +tap_aio_get_events(tap_aio_context_t *ctx) +{ + int nr_events = 0; + + if (!ctx->poll_in_thread) + nr_events = io_getevents(ctx->aio_ctx, 1, + ctx->max_aio_events, ctx->aio_events, NULL); + else + read(ctx->completion_fd[0], &nr_events, sizeof(nr_events)); + + return 
nr_events; +} + +int tap_aio_more_events(tap_aio_context_t *ctx) +{ + return io_getevents(ctx->aio_ctx, 0, + ctx->max_aio_events, ctx->aio_events, NULL); +} + + Index: xen-unstable.hg/tools/blktap/drivers/tapaio.h =================================================================== --- /dev/null +++ xen-unstable.hg/tools/blktap/drivers/tapaio.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2006 Andrew Warfield and Julian Chesterfield + * Copyright (c) 2007 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef __TAPAIO_H__ +#define __TAPAIO_H__ + +#include <pthread.h> +#include <libaio.h> + +struct tap_aio_context { + io_context_t aio_ctx; + + struct io_event *aio_events; + int max_aio_events; + + pthread_t aio_thread; + int command_fd[2]; + int completion_fd[2]; + int pollfd; + unsigned int poll_in_thread : 1; +}; + +typedef struct tap_aio_context tap_aio_context_t; + +int tap_aio_setup (tap_aio_context_t *ctx, + struct io_event *aio_events, + int max_aio_events); +void tap_aio_continue (tap_aio_context_t *ctx); +int tap_aio_get_events (tap_aio_context_t *ctx); +int tap_aio_more_events(tap_aio_context_t *ctx); + +#endif /* __TAPAIO_H__ */