
Re: [PATCH v2 5/6] block/linux-aio: convert to blk_io_plug_call() API



On Wed, May 24, 2023 at 03:36:34PM -0400, Stefan Hajnoczi wrote:
On Wed, May 24, 2023 at 10:52:03AM +0200, Stefano Garzarella wrote:
On Tue, May 23, 2023 at 01:12:59PM -0400, Stefan Hajnoczi wrote:
> Stop using the .bdrv_co_io_plug() API because it is not multi-queue
> block layer friendly. Use the new blk_io_plug_call() API to batch I/O
> submission instead.
>
> Signed-off-by: Stefan Hajnoczi <stefanha@xxxxxxxxxx>
> Reviewed-by: Eric Blake <eblake@xxxxxxxxxx>
> ---
> include/block/raw-aio.h |  7 -------
> block/file-posix.c      | 28 ----------------------------
> block/linux-aio.c       | 41 +++++++++++------------------------------
> 3 files changed, 11 insertions(+), 65 deletions(-)
>
> diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
> index da60ca13ef..0f63c2800c 100644
> --- a/include/block/raw-aio.h
> +++ b/include/block/raw-aio.h
> @@ -62,13 +62,6 @@ int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
>
> void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
> void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
> -
> -/*
> - * laio_io_plug/unplug work in the thread's current AioContext, therefore the
> - * caller must ensure that they are paired in the same IOThread.
> - */
> -void laio_io_plug(void);
> -void laio_io_unplug(uint64_t dev_max_batch);
> #endif
> /* io_uring.c - Linux io_uring implementation */
> #ifdef CONFIG_LINUX_IO_URING
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 7baa8491dd..ac1ed54811 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -2550,26 +2550,6 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
>     return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
> }
>
> -static void coroutine_fn raw_co_io_plug(BlockDriverState *bs)
> -{
> -    BDRVRawState __attribute__((unused)) *s = bs->opaque;
> -#ifdef CONFIG_LINUX_AIO
> -    if (s->use_linux_aio) {
> -        laio_io_plug();
> -    }
> -#endif
> -}
> -
> -static void coroutine_fn raw_co_io_unplug(BlockDriverState *bs)
> -{
> -    BDRVRawState __attribute__((unused)) *s = bs->opaque;
> -#ifdef CONFIG_LINUX_AIO
> -    if (s->use_linux_aio) {
> -        laio_io_unplug(s->aio_max_batch);
> -    }
> -#endif
> -}
> -
> static int coroutine_fn raw_co_flush_to_disk(BlockDriverState *bs)
> {
>     BDRVRawState *s = bs->opaque;
> @@ -3914,8 +3894,6 @@ BlockDriver bdrv_file = {
>     .bdrv_co_copy_range_from = raw_co_copy_range_from,
>     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
>     .bdrv_refresh_limits = raw_refresh_limits,
> -    .bdrv_co_io_plug        = raw_co_io_plug,
> -    .bdrv_co_io_unplug      = raw_co_io_unplug,
>     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>
>     .bdrv_co_truncate                   = raw_co_truncate,
> @@ -4286,8 +4264,6 @@ static BlockDriver bdrv_host_device = {
>     .bdrv_co_copy_range_from = raw_co_copy_range_from,
>     .bdrv_co_copy_range_to  = raw_co_copy_range_to,
>     .bdrv_refresh_limits = raw_refresh_limits,
> -    .bdrv_co_io_plug        = raw_co_io_plug,
> -    .bdrv_co_io_unplug      = raw_co_io_unplug,
>     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>
>     .bdrv_co_truncate                   = raw_co_truncate,
> @@ -4424,8 +4400,6 @@ static BlockDriver bdrv_host_cdrom = {
>     .bdrv_co_pwritev        = raw_co_pwritev,
>     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
>     .bdrv_refresh_limits    = cdrom_refresh_limits,
> -    .bdrv_co_io_plug        = raw_co_io_plug,
> -    .bdrv_co_io_unplug      = raw_co_io_unplug,
>     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>
>     .bdrv_co_truncate                   = raw_co_truncate,
> @@ -4552,8 +4526,6 @@ static BlockDriver bdrv_host_cdrom = {
>     .bdrv_co_pwritev        = raw_co_pwritev,
>     .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
>     .bdrv_refresh_limits    = cdrom_refresh_limits,
> -    .bdrv_co_io_plug        = raw_co_io_plug,
> -    .bdrv_co_io_unplug      = raw_co_io_unplug,
>     .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>
>     .bdrv_co_truncate                   = raw_co_truncate,
> diff --git a/block/linux-aio.c b/block/linux-aio.c
> index 442c86209b..5021aed68f 100644
> --- a/block/linux-aio.c
> +++ b/block/linux-aio.c
> @@ -15,6 +15,7 @@
> #include "qemu/event_notifier.h"
> #include "qemu/coroutine.h"
> #include "qapi/error.h"
> +#include "sysemu/block-backend.h"
>
> /* Only used for assertions.  */
> #include "qemu/coroutine_int.h"
> @@ -46,7 +47,6 @@ struct qemu_laiocb {
> };
>
> typedef struct {
> -    int plugged;
>     unsigned int in_queue;
>     unsigned int in_flight;
>     bool blocked;
> @@ -236,7 +236,7 @@ static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
> {
>     qemu_laio_process_completions(s);
>
> -    if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
> +    if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
>         ioq_submit(s);
>     }
> }
> @@ -277,7 +277,6 @@ static void qemu_laio_poll_ready(EventNotifier *opaque)
> static void ioq_init(LaioQueue *io_q)
> {
>     QSIMPLEQ_INIT(&io_q->pending);
> -    io_q->plugged = 0;
>     io_q->in_queue = 0;
>     io_q->in_flight = 0;
>     io_q->blocked = false;
> @@ -354,31 +353,11 @@ static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
>     return max_batch;
> }
>
> -void laio_io_plug(void)
> +static void laio_unplug_fn(void *opaque)
> {
> -    AioContext *ctx = qemu_get_current_aio_context();
> -    LinuxAioState *s = aio_get_linux_aio(ctx);
> +    LinuxAioState *s = opaque;
>
> -    s->io_q.plugged++;
> -}
> -
> -void laio_io_unplug(uint64_t dev_max_batch)
> -{
> -    AioContext *ctx = qemu_get_current_aio_context();
> -    LinuxAioState *s = aio_get_linux_aio(ctx);
> -
> -    assert(s->io_q.plugged);
> -    s->io_q.plugged--;
> -
> -    /*
> -     * Why max batch checking is performed here:
> -     * Another BDS may have queued requests with a higher dev_max_batch and
> -     * therefore in_queue could now exceed our dev_max_batch. Re-check the max
> -     * batch so we can honor our device's dev_max_batch.
> -     */
> -    if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch) ||

Why are we removing this condition?
Could the same situation occur with the new API?

The semantics of unplug_fn() are different from .bdrv_co_io_unplug():
1. unplug_fn() is only called when the last blk_io_unplug() call occurs,
  not every time blk_io_unplug() is called.
2. unplug_fn() is per-thread, not per-BlockDriverState, so there is no
  way to get per-BlockDriverState fields like dev_max_batch.

Therefore this condition cannot be moved to laio_unplug_fn().
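
To make that contract concrete, here is a minimal, self-contained toy
model of the plug/call/unplug semantics described above. It is not QEMU
code: io_plug(), io_plug_call(), io_unplug() and struct engine are
invented stand-ins for blk_io_plug(), blk_io_plug_call(), blk_io_unplug()
and LinuxAioState. The point it illustrates is that the deferred callback
only receives the thread-local opaque pointer, so no per-BlockDriverState
field (like dev_max_batch) is in scope when it runs:

    /* toy_plug.c - toy model of the plug/unplug semantics (not QEMU code) */
    #include <stdio.h>

    typedef void (*DeferFn)(void *opaque);

    /* One pending callback slot is enough for this illustration. */
    static unsigned plug_depth;
    static DeferFn pending_fn;
    static void *pending_opaque;

    static void io_plug(void)
    {
        plug_depth++;                /* per-thread nesting counter */
    }

    /* Remember fn/opaque; repeated registrations coalesce into one call. */
    static void io_plug_call(DeferFn fn, void *opaque)
    {
        if (plug_depth == 0) {
            fn(opaque);              /* not plugged: run immediately */
            return;
        }
        pending_fn = fn;
        pending_opaque = opaque;
    }

    static void io_unplug(void)
    {
        if (--plug_depth == 0 && pending_fn) {
            DeferFn fn = pending_fn;
            pending_fn = NULL;
            fn(pending_opaque);      /* runs once, at the outermost unplug */
        }
    }

    /* Stand-in for LinuxAioState: thread-local state, no per-device fields. */
    struct engine {
        int queued;
    };

    static void engine_flush(void *opaque)
    {
        struct engine *e = opaque;
        printf("submitting %d queued requests\n", e->queued);
        e->queued = 0;
    }

    int main(void)
    {
        struct engine e = { 0 };

        io_plug();
        io_plug();                        /* nesting is allowed */
        e.queued += 2;
        io_plug_call(engine_flush, &e);   /* deferred, not run yet */
        e.queued += 3;
        io_plug_call(engine_flush, &e);   /* coalesced with the previous one */
        io_unplug();                      /* still plugged: nothing happens */
        io_unplug();                      /* outermost unplug: flush runs once */
        return 0;
    }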

I see now.


How important is this condition? I believe that dropping it does not
have much of an effect but maybe I missed something.

Kevin and I agreed to add it to avoid extra latency on some devices,
but we didn't do much testing on this.

IIRC what solved the performance degradation was the check in
laio_do_submit() that we still have after this change.

So it may not have much effect, but maybe it's worth mentioning in
the commit description.
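
For reference, the check I mean is roughly this (a simplified sketch of
the submission path after this patch, not the literal code; the exact
condition and names may differ):

    /*
     * Sketch: dev_max_batch is still honored at queue time in
     * laio_do_submit(), even though laio_unplug_fn() can no longer
     * re-check it.
     */
    if (!s->io_q.blocked) {
        if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
            ioq_submit(s);                        /* batch limit reached */
        } else {
            blk_io_plug_call(laio_unplug_fn, s);  /* defer to last unplug */
        }
    }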


Also, does it make sense to define per-BlockDriverState batching limits
when the AIO engine (Linux AIO or io_uring) is thread-local and shared
between all BlockDriverStates? I believe the fundamental reason (that we
discovered later) why dev_max_batch is effective is that the Linux
kernel processes 32 I/O request submissions at a time. Anything above 32
adds latency without a batching benefit.

This is a good point, but maybe we should confirm it with some tests.

Thanks,
Stefano




 

