
Re: [Xen-devel] [PATCH RFC 12/12] xen-block: implement indirect descriptors



On Thu, Feb 28, 2013 at 11:28:55AM +0100, Roger Pau Monne wrote:
> Indirect descriptors introduce a new block operation
> (BLKIF_OP_INDIRECT) that passes grant references instead of segments
> in the request. These grant references are filled with arrays of
> blkif_request_segment_aligned; this way we can send more segments in a
> request.
> 
> The proposed implementation sets the maximum number of indirect grefs
> (frames filled with blkif_request_segment_aligned) to 256 in the
> backend and 64 in the frontend. The value in the frontend has been
> chosen experimentally, and the backend value has been set to a sane
> value that allows expanding the maximum number of indirect descriptors
> in the frontend if needed.
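
For context, here is a back-of-the-envelope sketch of what these limits mean
for the maximum I/O size per request, assuming 4 KiB pages, the 8-byte
blkif_request_segment_aligned layout defined in this patch, and reading the
256/64 figures as segment counts (MAX_INDIRECT_SEGMENTS in blkback and the
xen_blkif_max_segments default in blkfront):

#include <stdio.h>

/* Illustration only -- mirrors the constants used in the patch below. */
#define PAGE_SIZE_BYTES   4096u
#define SEG_ENTRY_BYTES   8u    /* sizeof(struct blkif_request_segment_aligned) */
#define SEGS_PER_FRAME    (PAGE_SIZE_BYTES / SEG_ENTRY_BYTES)  /* 512 per frame */

int main(void)
{
	unsigned int direct_segs   = 11;   /* BLKIF_MAX_SEGMENTS_PER_REQUEST */
	unsigned int frontend_segs = 64;   /* frontend default (xen_blkif_max_segments) */
	unsigned int backend_segs  = 256;  /* backend cap (MAX_INDIRECT_SEGMENTS) */

	/* Each segment covers at most one page, so the per-request ceiling is: */
	printf("direct ring slot : %u KiB\n", direct_segs * PAGE_SIZE_BYTES / 1024);
	printf("frontend indirect: %u KiB using %u indirect frame(s)\n",
	       frontend_segs * PAGE_SIZE_BYTES / 1024,
	       (frontend_segs + SEGS_PER_FRAME - 1) / SEGS_PER_FRAME);
	printf("backend indirect : %u KiB\n", backend_segs * PAGE_SIZE_BYTES / 1024);
	return 0;
}

In other words, one 4 KiB indirect frame already holds 512 segment entries, so
the frontend default of 64 segments needs a single indirect gref and lifts the
per-request ceiling from 44 KiB to 256 KiB (1 MiB at the backend cap).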

So we are still using a similar format of the form:

<gref, first_sect, last_sect, pad>, etc.

Why not utilize a layout that fits with the bio sg? That way
we might not even have to do the bio_alloc call, and instead can
set up a bio (and bio list) with the appropriate offsets/list?

Meaning that the format of the indirect descriptors is:

<gref, offset, next_index, pad>

We already know what the first_sect and last_sect are - they
are basically: sector_number + nr_segments * (whatever the sector size is) +
offset
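
To make that concrete, a rough sketch of what such an sg-friendly entry could
look like -- the struct and field names here are purely illustrative, not
something the patch (or Xen) defines:

#include <stdint.h>

typedef uint32_t grant_ref_t;   /* as in the Xen grant-table headers */

/*
 * Hypothetical <gref, offset, next_index, pad> entry: carry only the byte
 * offset of the data within the granted frame plus a link to the next
 * entry, and let the backend derive the sector span from sector_number,
 * the running segment count and the sector size, as argued above.
 */
struct blkif_request_segment_sg {
	grant_ref_t gref;        /* reference to the I/O buffer frame         */
	uint16_t    offset;      /* byte offset of the data within that frame */
	uint16_t    next_index;  /* next entry in the chain, ~0 means last    */
} __attribute__((__packed__));   /* still 8 bytes: 512 entries per 4 KiB frame */

Something along those lines would let the backend hand the offset/length pairs
straight to an sg list or bio, which is roughly the point of the suggestion
above.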



> 
> The migration code has changed from the previous implementation, in
> which we simply remapped the segments on the shared ring. Now the
> maximum number of segments allowed in a request can change depending
> on the backend, so we have to requeue all the requests in the ring and
> in the queue and split the bios in them if they are bigger than the
> new maximum number of segments.
> 
> Signed-off-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>
> Cc: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
> Cc: xen-devel@xxxxxxxxxxxxx
> ---
>  drivers/block/xen-blkback/blkback.c |  129 +++++++---
>  drivers/block/xen-blkback/common.h  |   80 ++++++-
>  drivers/block/xen-blkback/xenbus.c  |    8 +
>  drivers/block/xen-blkfront.c        |  498 +++++++++++++++++++++++++++++------
>  include/xen/interface/io/blkif.h    |   25 ++
>  5 files changed, 622 insertions(+), 118 deletions(-)
> 
> diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
> index 0fa30db..98eb16b 100644
> --- a/drivers/block/xen-blkback/blkback.c
> +++ b/drivers/block/xen-blkback/blkback.c
> @@ -70,7 +70,7 @@ MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate per backend");
>   * algorithm.
>   */
>  
> -static int xen_blkif_max_pgrants = 352;
> +static int xen_blkif_max_pgrants = 1024;
>  module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
>  MODULE_PARM_DESC(max_persistent_grants,
>                   "Maximum number of grants to map persistently");
> @@ -578,10 +578,6 @@ purge_gnt_list:
>       return 0;
>  }
>  
> -struct seg_buf {
> -     unsigned long buf;
> -     unsigned int nsec;
> -};
>  /*
>   * Unmap the grant references, and also remove the M2P over-rides
>   * used in the 'pending_req'.
> @@ -761,32 +757,79 @@ out_of_memory:
>       return -ENOMEM;
>  }
>  
> -static int xen_blkbk_map_seg(struct blkif_request *req,
> -                          struct pending_req *pending_req,
> +static int xen_blkbk_map_seg(struct pending_req *pending_req,
>                            struct seg_buf seg[],
>                            struct page *pages[])
>  {
>       int i, rc;
> -     grant_ref_t grefs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
>  
> -     for (i = 0; i < req->u.rw.nr_segments; i++)
> -             grefs[i] = req->u.rw.seg[i].gref;
> -
> -     rc = xen_blkbk_map(pending_req->blkif, grefs,
> +     rc = xen_blkbk_map(pending_req->blkif, pending_req->grefs,
>                          pending_req->persistent_gnts,
>                          pending_req->grant_handles, pending_req->pages,
> -                        req->u.rw.nr_segments,
> +                        pending_req->nr_pages,
>                          (pending_req->operation != BLKIF_OP_READ));
>       if (rc)
>               return rc;
>  
> -     for (i = 0; i < req->u.rw.nr_segments; i++)
> -             seg[i].buf = pfn_to_mfn(page_to_pfn(pending_req->pages[i]))
> -                          << PAGE_SHIFT | (req->u.rw.seg[i].first_sect << 9);
> +     for (i = 0; i < pending_req->nr_pages; i++)
> +             seg[i].buf |= pfn_to_mfn(page_to_pfn(pending_req->pages[i]))
> +                          << PAGE_SHIFT;
>  
>       return 0;
>  }
>  
> +static int xen_blkbk_parse_indirect(struct blkif_request *req,
> +                                    struct pending_req *pending_req,
> +                                    struct seg_buf seg[],
> +                                    struct phys_req *preq)
> +{
> +     struct persistent_gnt **persistent =
> +             pending_req->indirect_persistent_gnts;
> +     struct page **pages = pending_req->indirect_pages;
> +     struct xen_blkif *blkif = pending_req->blkif;
> +     int indirect_grefs, rc, n, nseg, i;
> +     struct blkif_request_segment_aligned *segments = NULL;
> +
> +     nseg = pending_req->nr_pages;
> +     indirect_grefs = (nseg + SEGS_PER_INDIRECT_FRAME - 1) /
> +                      SEGS_PER_INDIRECT_FRAME;
> +
> +     rc = xen_blkbk_map(blkif, req->u.indirect.indirect_grefs,
> +                        persistent, pending_req->indirect_handles,
> +                        pages, indirect_grefs, true);
> +     if (rc)
> +             goto unmap;
> +
> +     for (n = 0, i = 0; n < nseg; n++) {
> +             if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
> +                     /* Map indirect segments */
> +                     if (segments)
> +                             kunmap_atomic(segments);
> +                     segments =
> +                             kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]);
> +             }
> +             i = n % SEGS_PER_INDIRECT_FRAME;
> +             pending_req->grefs[n] = segments[i].gref;
> +             seg[n].nsec = segments[i].last_sect -
> +                     segments[i].first_sect + 1;
> +             seg[n].buf = segments[i].first_sect << 9;
> +             if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) ||
> +                 (segments[i].last_sect <
> +                  segments[i].first_sect)) {
> +                     rc = -EINVAL;
> +                     goto unmap;
> +             }
> +             preq->nr_sects += seg[n].nsec;
> +     }
> +
> +unmap:
> +     if (segments)
> +             kunmap_atomic(segments);
> +     xen_blkbk_unmap(blkif, pending_req->indirect_handles,
> +                        pages, persistent, indirect_grefs);
> +     return rc;
> +}
> +
>  static int dispatch_discard_io(struct xen_blkif *blkif,
>                               struct blkif_request *req)
>  {
> @@ -980,17 +1023,21 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
>                               struct pending_req *pending_req)
>  {
>       struct phys_req preq;
> -     struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> +     struct seg_buf *seg = pending_req->seg;
>       unsigned int nseg;
>       struct bio *bio = NULL;
> -     struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> +     struct bio **biolist = pending_req->biolist;
>       int i, nbio = 0;
>       int operation;
>       struct blk_plug plug;
>       bool drain = false;
>       struct page **pages = pending_req->pages;
> +     unsigned short req_operation;
> +
> +     req_operation = req->operation == BLKIF_OP_INDIRECT ?
> +                     req->u.indirect.indirect_op : req->operation;
>  
> -     switch (req->operation) {
> +     switch (req_operation) {
>       case BLKIF_OP_READ:
>               blkif->st_rd_req++;
>               operation = READ;
> @@ -1012,33 +1059,49 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
>       }
>  
>       /* Check that the number of segments is sane. */
> -     nseg = req->u.rw.nr_segments;
> +     nseg = req->operation == BLKIF_OP_INDIRECT ?
> +            req->u.indirect.nr_segments : req->u.rw.nr_segments;
>  
>       if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
> -         unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
> +         unlikely((req->operation != BLKIF_OP_INDIRECT) &&
> +                  (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
> +         unlikely((req->operation == BLKIF_OP_INDIRECT) &&
> +                  (nseg > MAX_INDIRECT_SEGMENTS))) {
>               pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
>                        nseg);
>               /* Haven't submitted any bio's yet. */
>               goto fail_response;
>       }
>  
> -     preq.sector_number = req->u.rw.sector_number;
>       preq.nr_sects      = 0;
>  
>       pending_req->blkif     = blkif;
> -     pending_req->id        = req->u.rw.id;
> -     pending_req->operation = req->operation;
>       pending_req->status    = BLKIF_RSP_OKAY;
>       pending_req->nr_pages  = nseg;
>  
> -     for (i = 0; i < nseg; i++) {
> -             seg[i].nsec = req->u.rw.seg[i].last_sect -
> -                     req->u.rw.seg[i].first_sect + 1;
> -             if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
> -                 (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
> +     if (req->operation != BLKIF_OP_INDIRECT) {
> +             preq.dev               = req->u.rw.handle;
> +             preq.sector_number     = req->u.rw.sector_number;
> +             pending_req->id        = req->u.rw.id;
> +             pending_req->operation = req->operation;
> +             for (i = 0; i < nseg; i++) {
> +                     pending_req->grefs[i] = req->u.rw.seg[i].gref;
> +                     seg[i].nsec = req->u.rw.seg[i].last_sect -
> +                             req->u.rw.seg[i].first_sect + 1;
> +                     seg[i].buf = req->u.rw.seg[i].first_sect << 9;
> +                     if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
> +                         (req->u.rw.seg[i].last_sect <
> +                          req->u.rw.seg[i].first_sect))
> +                             goto fail_response;
> +                     preq.nr_sects += seg[i].nsec;
> +             }
> +     } else {
> +             preq.dev               = req->u.indirect.handle;
> +             preq.sector_number     = req->u.indirect.sector_number;
> +             pending_req->id        = req->u.indirect.id;
> +             pending_req->operation = req->u.indirect.indirect_op;
> +             if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
>                       goto fail_response;
> -             preq.nr_sects += seg[i].nsec;
> -
>       }
>  
>       if (xen_vbd_translate(&preq, blkif, operation) != 0) {
> @@ -1074,7 +1137,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
>        * the hypercall to unmap the grants - that is all done in
>        * xen_blkbk_unmap.
>        */
> -     if (xen_blkbk_map_seg(req, pending_req, seg, pages))
> +     if (xen_blkbk_map_seg(pending_req, seg, pages))
>               goto fail_flush;
>  
>       /*
> @@ -1146,7 +1209,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
>                       pending_req->nr_pages);
>   fail_response:
>       /* Haven't submitted any bio's yet. */
> -     make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR);
> +     make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
>       free_req(blkif, pending_req);
>       msleep(1); /* back off a bit */
>       return -EIO;
> diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
> index 0b0ad3f..d3656d2 100644
> --- a/drivers/block/xen-blkback/common.h
> +++ b/drivers/block/xen-blkback/common.h
> @@ -50,6 +50,17 @@
>                __func__, __LINE__, ##args)
>  
>  
> +/*
> + * This is the maximum number of segments that would be allowed in indirect
> + * requests. This value will also be passed to the frontend.
> + */
> +#define MAX_INDIRECT_SEGMENTS 256
> +
> +#define SEGS_PER_INDIRECT_FRAME \
> +(PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
> +#define MAX_INDIRECT_GREFS \
> +((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
> +
>  /* Not a real protocol.  Used to generate ring structs which contain
>   * the elements common to all protocols only.  This way we get a
>   * compiler-checkable way to use common struct elements, so we can
> @@ -77,11 +88,21 @@ struct blkif_x86_32_request_discard {
>       uint64_t       nr_sectors;
>  } __attribute__((__packed__));
>  
> +struct blkif_x86_32_request_indirect {
> +     uint8_t        indirect_op;
> +     uint16_t       nr_segments;
> +     uint64_t       id;
> +     blkif_vdev_t   handle;
> +     blkif_sector_t sector_number;
> +     grant_ref_t    indirect_grefs[BLKIF_MAX_INDIRECT_GREFS_PER_REQUEST];
> +} __attribute__((__packed__));
> +
>  struct blkif_x86_32_request {
>       uint8_t        operation;    /* BLKIF_OP_???                         */
>       union {
>               struct blkif_x86_32_request_rw rw;
>               struct blkif_x86_32_request_discard discard;
> +             struct blkif_x86_32_request_indirect indirect;
>       } u;
>  } __attribute__((__packed__));
>  
> @@ -113,11 +134,22 @@ struct blkif_x86_64_request_discard {
>       uint64_t       nr_sectors;
>  } __attribute__((__packed__));
>  
> +struct blkif_x86_64_request_indirect {
> +     uint8_t        indirect_op;
> +     uint16_t       nr_segments;
> +     uint32_t       _pad1;        /* offsetof(blkif_..,u.indirect.id)==8   */
> +     uint64_t       id;
> +     blkif_vdev_t   handle;
> +     blkif_sector_t sector_number;
> +     grant_ref_t    indirect_grefs[BLKIF_MAX_INDIRECT_GREFS_PER_REQUEST];
> +} __attribute__((__packed__));
> +
>  struct blkif_x86_64_request {
>       uint8_t        operation;    /* BLKIF_OP_???                         */
>       union {
>               struct blkif_x86_64_request_rw rw;
>               struct blkif_x86_64_request_discard discard;
> +             struct blkif_x86_64_request_indirect indirect;
>       } u;
>  } __attribute__((__packed__));
>  
> @@ -235,6 +267,11 @@ struct xen_blkif {
>       wait_queue_head_t       waiting_to_free;
>  };
>  
> +struct seg_buf {
> +     unsigned long buf;
> +     unsigned int nsec;
> +};
> +
>  /*
>   * Each outstanding request that we've passed to the lower device layers has a
>   * 'pending_req' allocated to it. Each buffer_head that completes decrements
> @@ -249,9 +286,16 @@ struct pending_req {
>       unsigned short          operation;
>       int                     status;
>       struct list_head        free_list;
> -     struct persistent_gnt   *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> -     struct page             *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> -     grant_handle_t          grant_handles[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> +     struct persistent_gnt   *persistent_gnts[MAX_INDIRECT_SEGMENTS];
> +     struct page             *pages[MAX_INDIRECT_SEGMENTS];
> +     grant_handle_t          grant_handles[MAX_INDIRECT_SEGMENTS];
> +     grant_ref_t             grefs[MAX_INDIRECT_SEGMENTS];
> +     /* Indirect descriptors */
> +     struct persistent_gnt   *indirect_persistent_gnts[MAX_INDIRECT_GREFS];
> +     struct page             *indirect_pages[MAX_INDIRECT_GREFS];
> +     grant_handle_t          indirect_handles[MAX_INDIRECT_GREFS];
> +     struct seg_buf          seg[MAX_INDIRECT_SEGMENTS];
> +     struct bio              *biolist[MAX_INDIRECT_SEGMENTS];
>  };
>  
>  
> @@ -289,7 +333,7 @@ struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
>  static inline void blkif_get_x86_32_req(struct blkif_request *dst,
>                                       struct blkif_x86_32_request *src)
>  {
> -     int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
> +     int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j = MAX_INDIRECT_GREFS;
>       dst->operation = src->operation;
>       switch (src->operation) {
>       case BLKIF_OP_READ:
> @@ -312,6 +356,19 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
>               dst->u.discard.sector_number = src->u.discard.sector_number;
>               dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
>               break;
> +     case BLKIF_OP_INDIRECT:
> +             dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
> +             dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
> +             dst->u.indirect.handle = src->u.indirect.handle;
> +             dst->u.indirect.id = src->u.indirect.id;
> +             dst->u.indirect.sector_number = src->u.indirect.sector_number;
> +             barrier();
> +             if (j > dst->u.indirect.nr_segments)
> +                     j = dst->u.indirect.nr_segments;
> +             for (i = 0; i < j; i++)
> +                     dst->u.indirect.indirect_grefs[i] =
> +                             src->u.indirect.indirect_grefs[i];
> +             break;
>       default:
>               break;
>       }
> @@ -320,7 +377,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
>  static inline void blkif_get_x86_64_req(struct blkif_request *dst,
>                                       struct blkif_x86_64_request *src)
>  {
> -     int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
> +     int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j = MAX_INDIRECT_GREFS;
>       dst->operation = src->operation;
>       switch (src->operation) {
>       case BLKIF_OP_READ:
> @@ -343,6 +400,19 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
>               dst->u.discard.sector_number = src->u.discard.sector_number;
>               dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
>               break;
> +     case BLKIF_OP_INDIRECT:
> +             dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
> +             dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
> +             dst->u.indirect.handle = src->u.indirect.handle;
> +             dst->u.indirect.id = src->u.indirect.id;
> +             dst->u.indirect.sector_number = src->u.indirect.sector_number;
> +             barrier();
> +             if (j > dst->u.indirect.nr_segments)
> +                     j = dst->u.indirect.nr_segments;
> +             for (i = 0; i < j; i++)
> +                     dst->u.indirect.indirect_grefs[i] =
> +                             src->u.indirect.indirect_grefs[i];
> +             break;
>       default:
>               break;
>       }
> diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
> index 8f929cb..9e16abb 100644
> --- a/drivers/block/xen-blkback/xenbus.c
> +++ b/drivers/block/xen-blkback/xenbus.c
> @@ -700,6 +700,14 @@ again:
>               goto abort;
>       }
>  
> +     err = xenbus_printf(xbt, dev->nodename, "max-indirect-segments", "%u",
> +                         MAX_INDIRECT_SEGMENTS);
> +     if (err) {
> +             xenbus_dev_fatal(dev, err, "writing %s/max-indirect-segments",
> +                              dev->nodename);
> +             goto abort;
> +     }
> +
>       err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
>                           (unsigned long long)vbd_sz(&be->blkif->vbd));
>       if (err) {
> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
> index 4d81fcc..074d302 100644
> --- a/drivers/block/xen-blkfront.c
> +++ b/drivers/block/xen-blkfront.c
> @@ -74,12 +74,30 @@ struct grant {
>  struct blk_shadow {
>       struct blkif_request req;
>       struct request *request;
> -     struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> +     struct grant **grants_used;
> +     struct grant **indirect_grants;
> +};
> +
> +struct split_bio {
> +     struct bio *bio;
> +     atomic_t pending;
> +     int err;
>  };
>  
>  static DEFINE_MUTEX(blkfront_mutex);
>  static const struct block_device_operations xlvbd_block_fops;
>  
> +/*
> + * Maximum number of segments in indirect requests, the actual value used by
> + * the frontend driver is the minimum of this value and the value provided
> + * by the backend driver.
> + */
> +
> +static int xen_blkif_max_segments = 64;
> +module_param_named(max_segments, xen_blkif_max_segments, int, 0);
> +MODULE_PARM_DESC(max_segments,
> +"Maximum number of segments in indirect requests");
> +
>  #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
>  
>  /*
> @@ -98,7 +116,7 @@ struct blkfront_info
>       enum blkif_state connected;
>       int ring_ref;
>       struct blkif_front_ring ring;
> -     struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> +     struct scatterlist *sg;
>       unsigned int evtchn, irq;
>       struct request_queue *rq;
>       struct work_struct work;
> @@ -114,6 +132,8 @@ struct blkfront_info
>       unsigned int discard_granularity;
>       unsigned int discard_alignment;
>       unsigned int feature_persistent:1;
> +     unsigned int max_indirect_segments;
> +     unsigned int sector_size;
>       int is_ready;
>  };
>  
> @@ -142,6 +162,14 @@ static DEFINE_SPINLOCK(minor_lock);
>  
>  #define DEV_NAME     "xvd"   /* name in /dev */
>  
> +#define SEGS_PER_INDIRECT_FRAME \
> +     (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
> +#define INDIRECT_GREFS(_segs) \
> +     ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
> +#define MIN(_a, _b) ((_a) < (_b) ? (_a) : (_b))
> +
> +static int blkfront_setup_indirect(struct blkfront_info *info);
> +
>  static int get_id_from_freelist(struct blkfront_info *info)
>  {
>       unsigned long free = info->shadow_free;
> @@ -358,7 +386,8 @@ static int blkif_queue_request(struct request *req)
>       struct blkif_request *ring_req;
>       unsigned long id;
>       unsigned int fsect, lsect;
> -     int i, ref;
> +     int i, ref, n;
> +     struct blkif_request_segment_aligned *segments = NULL;
>  
>       /*
>        * Used to store if we are able to queue the request by just using
> @@ -369,21 +398,27 @@ static int blkif_queue_request(struct request *req)
>       grant_ref_t gref_head;
>       struct grant *gnt_list_entry = NULL;
>       struct scatterlist *sg;
> +     int nseg, max_grefs;
>  
>       if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
>               return 1;
>  
> -     /* Check if we have enought grants to allocate a requests */
> -     if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) {
> +     max_grefs = info->max_indirect_segments ?
> +                 info->max_indirect_segments +
> +                 INDIRECT_GREFS(info->max_indirect_segments) :
> +                 BLKIF_MAX_SEGMENTS_PER_REQUEST;
> +
> +     /* Check if we have enough grants to allocate a requests */
> +     if (info->persistent_gnts_c < max_grefs) {
>               new_persistent_gnts = 1;
>               if (gnttab_alloc_grant_references(
> -                 BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c,
> +                 max_grefs - info->persistent_gnts_c,
>                   &gref_head) < 0) {
>                       gnttab_request_free_callback(
>                               &info->callback,
>                               blkif_restart_queue_callback,
>                               info,
> -                             BLKIF_MAX_SEGMENTS_PER_REQUEST);
> +                             max_grefs);
>                       return 1;
>               }
>       } else
> @@ -394,42 +429,82 @@ static int blkif_queue_request(struct request *req)
>       id = get_id_from_freelist(info);
>       info->shadow[id].request = req;
>  
> -     ring_req->u.rw.id = id;
> -     ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
> -     ring_req->u.rw.handle = info->handle;
> -
> -     ring_req->operation = rq_data_dir(req) ?
> -             BLKIF_OP_WRITE : BLKIF_OP_READ;
> -
> -     if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
> -             /*
> -              * Ideally we can do an unordered flush-to-disk. In case the
> -              * backend onlysupports barriers, use that. A barrier request
> -              * a superset of FUA, so we can implement it the same
> -              * way.  (It's also a FLUSH+FUA, since it is
> -              * guaranteed ordered WRT previous writes.)
> -              */
> -             ring_req->operation = info->flush_op;
> -     }
> -
>       if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
>               /* id, sector_number and handle are set above. */
>               ring_req->operation = BLKIF_OP_DISCARD;
>               ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
> +             ring_req->u.discard.id = id;
> +             ring_req->u.discard.sector_number =
> +                     (blkif_sector_t)blk_rq_pos(req);
>               if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
>                       ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
>               else
>                       ring_req->u.discard.flag = 0;
>       } else {
> -             ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req,
> -                                                        info->sg);
> -             BUG_ON(ring_req->u.rw.nr_segments >
> -                    BLKIF_MAX_SEGMENTS_PER_REQUEST);
> -
> -             for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) {
> +             BUG_ON(info->max_indirect_segments == 0 &&
> +                    req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
> +             BUG_ON(info->max_indirect_segments &&
> +                    req->nr_phys_segments > info->max_indirect_segments);
> +             nseg = blk_rq_map_sg(req->q, req, info->sg);
> +             if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
> +                     /* Indirect OP */
> +                     ring_req->operation = BLKIF_OP_INDIRECT;
> +                     ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
> +                             BLKIF_OP_WRITE : BLKIF_OP_READ;
> +                     ring_req->u.indirect.id = id;
> +                     ring_req->u.indirect.sector_number =
> +                             (blkif_sector_t)blk_rq_pos(req);
> +                     ring_req->u.indirect.handle = info->handle;
> +                     if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
> +             /*
> +              * Ideally we can do an unordered flush-to-disk. In case the
> +              * backend onlysupports barriers, use that. A barrier request
> +              * a superset of FUA, so we can implement it the same
> +              * way.  (It's also a FLUSH+FUA, since it is
> +              * guaranteed ordered WRT previous writes.)
> +              */
> +                             ring_req->u.indirect.indirect_op =
> +                                     info->flush_op;
> +                     }
> +                     ring_req->u.indirect.nr_segments = nseg;
> +             } else {
> +                     ring_req->u.rw.id = id;
> +                     ring_req->u.rw.sector_number =
> +                             (blkif_sector_t)blk_rq_pos(req);
> +                     ring_req->u.rw.handle = info->handle;
> +                     ring_req->operation = rq_data_dir(req) ?
> +                             BLKIF_OP_WRITE : BLKIF_OP_READ;
> +                     if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
> +             /*
> +              * Ideally we can do an unordered flush-to-disk. In case the
> +              * backend onlysupports barriers, use that. A barrier request
> +              * a superset of FUA, so we can implement it the same
> +              * way.  (It's also a FLUSH+FUA, since it is
> +              * guaranteed ordered WRT previous writes.)
> +              */
> +                             ring_req->operation = info->flush_op;
> +                     }
> +                     ring_req->u.rw.nr_segments = nseg;
> +             }
> +             for_each_sg(info->sg, sg, nseg, i) {
>                       fsect = sg->offset >> 9;
>                       lsect = fsect + (sg->length >> 9) - 1;
>  
> +                     if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
> +                         (i % SEGS_PER_INDIRECT_FRAME == 0)) {
> +                             if (segments)
> +                                     kunmap_atomic(segments);
> +
> +                             n = i / SEGS_PER_INDIRECT_FRAME;
> +                             gnt_list_entry = get_grant(&gref_head, info);
> +                             info->shadow[id].indirect_grants[n] =
> +                                     gnt_list_entry;
> +                             segments = kmap_atomic(
> +                                     pfn_to_page(gnt_list_entry->pfn));
> +                             ring_req->u.indirect.indirect_grefs[n] =
> +                                     gnt_list_entry->gref;
> +                     }
> +
>                       gnt_list_entry = get_grant(&gref_head, info);
>                       ref = gnt_list_entry->gref;
>  
> @@ -461,13 +536,23 @@ static int blkif_queue_request(struct request *req)
>                               kunmap_atomic(bvec_data);
>                               kunmap_atomic(shared_data);
>                       }
> -
> -                     ring_req->u.rw.seg[i] =
> -                                     (struct blkif_request_segment) {
> -                                             .gref       = ref,
> -                                             .first_sect = fsect,
> -                                             .last_sect  = lsect };
> +                     if (ring_req->operation != BLKIF_OP_INDIRECT) {
> +                             ring_req->u.rw.seg[i] =
> +                                             (struct blkif_request_segment) {
> +                                                     .gref       = ref,
> +                                                     .first_sect = fsect,
> +                                                     .last_sect  = lsect };
> +                     } else {
> +                             n = i % SEGS_PER_INDIRECT_FRAME;
> +                             segments[n] =
> +                                     (struct blkif_request_segment_aligned) {
> +                                                     .gref       = ref,
> +                                                     .first_sect = fsect,
> +                                                     .last_sect  = lsect };
> +                     }
>               }
> +             if (segments)
> +                     kunmap_atomic(segments);
>       }
>  
>       info->ring.req_prod_pvt++;
> @@ -542,7 +627,8 @@ wait:
>               flush_requests(info);
>  }
>  
> -static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
> +static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
> +                                unsigned int segments)
>  {
>       struct request_queue *rq;
>       struct blkfront_info *info = gd->private_data;
> @@ -571,7 +657,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
>       blk_queue_max_segment_size(rq, PAGE_SIZE);
>  
>       /* Ensure a merged request will fit in a single I/O ring slot. */
> -     blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
> +     blk_queue_max_segments(rq, segments);
>  
>       /* Make sure buffer addresses are sector-aligned. */
>       blk_queue_dma_alignment(rq, 511);
> @@ -588,13 +674,14 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
>  static void xlvbd_flush(struct blkfront_info *info)
>  {
>       blk_queue_flush(info->rq, info->feature_flush);
> -     printk(KERN_INFO "blkfront: %s: %s: %s %s\n",
> +     printk(KERN_INFO "blkfront: %s: %s: %s %s %s\n",
>              info->gd->disk_name,
>              info->flush_op == BLKIF_OP_WRITE_BARRIER ?
>               "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
>               "flush diskcache" : "barrier or flush"),
>              info->feature_flush ? "enabled" : "disabled",
> -            info->feature_persistent ? "using persistent grants" : "");
> +            info->feature_persistent ? "using persistent grants" : "",
> +            info->max_indirect_segments ? "using indirect descriptors" : "");
>  }
>  
>  static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
> @@ -734,7 +821,9 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
>       gd->driverfs_dev = &(info->xbdev->dev);
>       set_capacity(gd, capacity);
>  
> -     if (xlvbd_init_blk_queue(gd, sector_size)) {
> +     if (xlvbd_init_blk_queue(gd, sector_size,
> +                              info->max_indirect_segments ? :
> +                              BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
>               del_gendisk(gd);
>               goto release;
>       }
> @@ -818,6 +907,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
>  {
>       struct grant *persistent_gnt;
>       struct grant *n;
> +     int i, j, segs;
>  
>       /* Prevent new requests being issued until we fix things up. */
>       spin_lock_irq(&info->io_lock);
> @@ -843,6 +933,47 @@ static void blkif_free(struct blkfront_info *info, int suspend)
>       }
>       BUG_ON(info->persistent_gnts_c != 0);
>  
> +     kfree(info->sg);
> +     info->sg = NULL;
> +     for (i = 0; i < BLK_RING_SIZE; i++) {
> +             /*
> +              * Clear persistent grants present in requests already
> +              * on the shared ring
> +              */
> +             if (!info->shadow[i].request)
> +                     goto free_shadow;
> +
> +             segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
> +                    info->shadow[i].req.u.indirect.nr_segments :
> +                    info->shadow[i].req.u.rw.nr_segments;
> +             for (j = 0; j < segs; j++) {
> +                     persistent_gnt = info->shadow[i].grants_used[j];
> +                     gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
> +                     __free_page(pfn_to_page(persistent_gnt->pfn));
> +                     kfree(persistent_gnt);
> +             }
> +
> +             if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
> +                     /*
> +                      * If this is not an indirect operation don't try to
> +                      * free indirect segments
> +                      */
> +                     goto free_shadow;
> +
> +             for (j = 0; j < INDIRECT_GREFS(segs); j++) {
> +                     persistent_gnt = info->shadow[i].indirect_grants[j];
> +                     gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
> +                     __free_page(pfn_to_page(persistent_gnt->pfn));
> +                     kfree(persistent_gnt);
> +             }
> +
> +free_shadow:
> +             kfree(info->shadow[i].grants_used);
> +             info->shadow[i].grants_used = NULL;
> +             kfree(info->shadow[i].indirect_grants);
> +             info->shadow[i].indirect_grants = NULL;
> +     }
> +
>       /* No more gnttab callback work. */
>       gnttab_cancel_free_callback(&info->callback);
>       spin_unlock_irq(&info->io_lock);
> @@ -873,6 +1004,10 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
>       char *bvec_data;
>       void *shared_data;
>       unsigned int offset = 0;
> +     int nseg;
> +
> +     nseg = s->req.operation == BLKIF_OP_INDIRECT ?
> +             s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
>  
>       if (bret->operation == BLKIF_OP_READ) {
>               /*
> @@ -885,7 +1020,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
>                       BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE);
>                       if (bvec->bv_offset < offset)
>                               i++;
> -                     BUG_ON(i >= s->req.u.rw.nr_segments);
> +                     BUG_ON(i >= nseg);
>                       shared_data = kmap_atomic(
>                               pfn_to_page(s->grants_used[i]->pfn));
>                       bvec_data = bvec_kmap_irq(bvec, &flags);
> @@ -897,10 +1032,17 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
>               }
>       }
>       /* Add the persistent grant into the list of free grants */
> -     for (i = 0; i < s->req.u.rw.nr_segments; i++) {
> +     for (i = 0; i < nseg; i++) {
>               list_add(&s->grants_used[i]->node, &info->persistent_gnts);
>               info->persistent_gnts_c++;
>       }
> +     if (s->req.operation == BLKIF_OP_INDIRECT) {
> +             for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
> +                     list_add(&s->indirect_grants[i]->node,
> +                              &info->persistent_gnts);
> +                     info->persistent_gnts_c++;
> +             }
> +     }
>  }
>  
>  static irqreturn_t blkif_interrupt(int irq, void *dev_id)
> @@ -1034,8 +1176,6 @@ static int setup_blkring(struct xenbus_device *dev,
>       SHARED_RING_INIT(sring);
>       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
>  
> -     sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
> -
>       err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
>       if (err < 0) {
>               free_page((unsigned long)sring);
> @@ -1116,12 +1256,6 @@ again:
>               goto destroy_blkring;
>       }
>  
> -     /* Allocate memory for grants */
> -     err = fill_grant_buffer(info, BLK_RING_SIZE *
> -                                   BLKIF_MAX_SEGMENTS_PER_REQUEST);
> -     if (err)
> -             goto out;
> -
>       xenbus_switch_state(dev, XenbusStateInitialised);
>  
>       return 0;
> @@ -1223,13 +1357,84 @@ static int blkfront_probe(struct xenbus_device *dev,
>       return 0;
>  }
>  
> +/*
> + * This is a clone of md_trim_bio, used to split a bio into smaller ones
> + */
> +static void trim_bio(struct bio *bio, int offset, int size)
> +{
> +     /* 'bio' is a cloned bio which we need to trim to match
> +      * the given offset and size.
> +      * This requires adjusting bi_sector, bi_size, and bi_io_vec
> +      */
> +     int i;
> +     struct bio_vec *bvec;
> +     int sofar = 0;
> +
> +     size <<= 9;
> +     if (offset == 0 && size == bio->bi_size)
> +             return;
> +
> +     bio->bi_sector += offset;
> +     bio->bi_size = size;
> +     offset <<= 9;
> +     clear_bit(BIO_SEG_VALID, &bio->bi_flags);
> +
> +     while (bio->bi_idx < bio->bi_vcnt &&
> +            bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
> +             /* remove this whole bio_vec */
> +             offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
> +             bio->bi_idx++;
> +     }
> +     if (bio->bi_idx < bio->bi_vcnt) {
> +             bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
> +             bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
> +     }
> +     /* avoid any complications with bi_idx being non-zero*/
> +     if (bio->bi_idx) {
> +             memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
> +                     (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
> +             bio->bi_vcnt -= bio->bi_idx;
> +             bio->bi_idx = 0;
> +     }
> +     /* Make sure vcnt and last bv are not too big */
> +     bio_for_each_segment(bvec, bio, i) {
> +             if (sofar + bvec->bv_len > size)
> +                     bvec->bv_len = size - sofar;
> +             if (bvec->bv_len == 0) {
> +                     bio->bi_vcnt = i;
> +                     break;
> +             }
> +             sofar += bvec->bv_len;
> +     }
> +}
> +
> +static void split_bio_end(struct bio *bio, int error)
> +{
> +     struct split_bio *split_bio = bio->bi_private;
> +
> +     if (error)
> +             split_bio->err = error;
> +
> +     if (atomic_dec_and_test(&split_bio->pending)) {
> +             split_bio->bio->bi_phys_segments = 0;
> +             bio_endio(split_bio->bio, split_bio->err);
> +             kfree(split_bio);
> +     }
> +     bio_put(bio);
> +}
>  
>  static int blkif_recover(struct blkfront_info *info)
>  {
>       int i;
> -     struct blkif_request *req;
> +     struct request *req, *n;
>       struct blk_shadow *copy;
> -     int j;
> +     int rc;
> +     struct bio *bio, *cloned_bio;
> +     struct bio_list bio_list, merge_bio;
> +     unsigned int segs;
> +     int pending, offset, size;
> +     struct split_bio *split_bio;
> +     struct list_head requests;
>  
>       /* Stage 1: Make a safe copy of the shadow state. */
>       copy = kmalloc(sizeof(info->shadow),
> @@ -1245,36 +1450,64 @@ static int blkif_recover(struct blkfront_info *info)
>       info->shadow_free = info->ring.req_prod_pvt;
>       info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
>  
> -     /* Stage 3: Find pending requests and requeue them. */
> +     rc = blkfront_setup_indirect(info);
> +     if (rc) {
> +             kfree(copy);
> +             return rc;
> +     }
> +
> +     segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
> +     blk_queue_max_segments(info->rq, segs);
> +     bio_list_init(&bio_list);
> +     INIT_LIST_HEAD(&requests);
>       for (i = 0; i < BLK_RING_SIZE; i++) {
>               /* Not in use? */
>               if (!copy[i].request)
>                       continue;
>  
> -             /* Grab a request slot and copy shadow state into it. */
> -             req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
> -             *req = copy[i].req;
> -
> -             /* We get a new request id, and must reset the shadow state. */
> -             req->u.rw.id = get_id_from_freelist(info);
> -             memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i]));
> -
> -             if (req->operation != BLKIF_OP_DISCARD) {
> -             /* Rewrite any grant references invalidated by susp/resume. */
> -                     for (j = 0; j < req->u.rw.nr_segments; j++)
> -                             gnttab_grant_foreign_access_ref(
> -                                     req->u.rw.seg[j].gref,
> -                                     info->xbdev->otherend_id,
> -                                     pfn_to_mfn(copy[i].grants_used[j]->pfn),
> -                                     0);
> +             /*
> +              * Get the bios in the request so we can re-queue them.
> +              */
> +             if (copy[i].request->cmd_flags &
> +                 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
> +                     /*
> +                      * Flush operations don't contain bios, so
> +                      * we need to requeue the whole request
> +                      */
> +                     list_add(&copy[i].request->queuelist, &requests);
> +                     continue;
>               }
> -             info->shadow[req->u.rw.id].req = *req;
> -
> -             info->ring.req_prod_pvt++;
> +             merge_bio.head = copy[i].request->bio;
> +             merge_bio.tail = copy[i].request->biotail;
> +             bio_list_merge(&bio_list, &merge_bio);
> +             copy[i].request->bio = NULL;
> +             blk_put_request(copy[i].request);
>       }
>  
>       kfree(copy);
>  
> +     /*
> +      * Empty the queue, this is important because we might have
> +      * requests in the queue with more segments than what we
> +      * can handle now.
> +      */
> +     spin_lock_irq(&info->io_lock);
> +     while ((req = blk_fetch_request(info->rq)) != NULL) {
> +             if (req->cmd_flags &
> +                 (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
> +                     list_add(&req->queuelist, &requests);
> +                     continue;
> +             }
> +             merge_bio.head = req->bio;
> +             merge_bio.tail = req->biotail;
> +             bio_list_merge(&bio_list, &merge_bio);
> +             req->bio = NULL;
> +             if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
> +                     pr_alert("diskcache flush request found!\n");
> +             __blk_put_request(info->rq, req);
> +     }
> +     spin_unlock_irq(&info->io_lock);
> +
>       xenbus_switch_state(info->xbdev, XenbusStateConnected);
>  
>       spin_lock_irq(&info->io_lock);
> @@ -1282,14 +1515,50 @@ static int blkif_recover(struct blkfront_info *info)
>       /* Now safe for us to use the shared ring */
>       info->connected = BLKIF_STATE_CONNECTED;
>  
> -     /* Send off requeued requests */
> -     flush_requests(info);
> -
>       /* Kick any other new requests queued since we resumed */
>       kick_pending_request_queues(info);
>  
> +     list_for_each_entry_safe(req, n, &requests, queuelist) {
> +             /* Requeue pending requests (flush or discard) */
> +             list_del_init(&req->queuelist);
> +             BUG_ON(req->nr_phys_segments > segs);
> +             blk_requeue_request(info->rq, req);
> +     }
>       spin_unlock_irq(&info->io_lock);
>  
> +     while ((bio = bio_list_pop(&bio_list)) != NULL) {
> +             /* Traverse the list of pending bios and re-queue them */
> +             if (bio_segments(bio) > segs) {
> +                     /*
> +                      * This bio has more segments than what we can
> +                      * handle, we have to split it.
> +                      */
> +                     pending = (bio_segments(bio) + segs - 1) / segs;
> +                     split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO);
> +                     BUG_ON(split_bio == NULL);
> +                     atomic_set(&split_bio->pending, pending);
> +                     split_bio->bio = bio;
> +                     for (i = 0; i < pending; i++) {
> +                             offset = (i * segs * PAGE_SIZE) >> 9;
> +                             size = MIN((segs * PAGE_SIZE) >> 9,
> +                                        (bio->bi_size >> 9) - offset);
> +                             cloned_bio = bio_clone(bio, GFP_NOIO);
> +                             BUG_ON(cloned_bio == NULL);
> +                             trim_bio(cloned_bio, offset, size);
> +                             cloned_bio->bi_private = split_bio;
> +                             cloned_bio->bi_end_io = split_bio_end;
> +                             submit_bio(cloned_bio->bi_rw, cloned_bio);
> +                     }
> +                     /*
> +                      * Now we have to wait for all those smaller bios to
> +                      * end, so we can also end the "parent" bio.
> +                      */
> +                     continue;
> +             }
> +             /* We don't need to split this bio */
> +             submit_bio(bio->bi_rw, bio);
> +     }
> +
>       return 0;
>  }
>  
> @@ -1309,8 +1578,12 @@ static int blkfront_resume(struct xenbus_device *dev)
>       blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
>  
>       err = talk_to_blkback(dev, info);
> -     if (info->connected == BLKIF_STATE_SUSPENDED && !err)
> -             err = blkif_recover(info);
> +
> +     /*
> +      * We have to wait for the backend to switch to
> +      * connected state, since we want to read which
> +      * features it supports.
> +      */
>  
>       return err;
>  }
> @@ -1388,6 +1661,62 @@ static void blkfront_setup_discard(struct blkfront_info *info)
>       kfree(type);
>  }
>  
> +static int blkfront_setup_indirect(struct blkfront_info *info)
> +{
> +     unsigned int indirect_segments, segs;
> +     int err, i;
> +
> +     err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
> +                         "max-indirect-segments", "%u", &indirect_segments,
> +                         NULL);
> +     if (err) {
> +             info->max_indirect_segments = 0;
> +             segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
> +     } else {
> +             info->max_indirect_segments = MIN(indirect_segments,
> +                                               xen_blkif_max_segments);
> +             segs = info->max_indirect_segments;
> +     }
> +     info->sg = kzalloc(sizeof(info->sg[0]) * segs, GFP_KERNEL);
> +     if (info->sg == NULL)
> +             goto out_of_memory;
> +     sg_init_table(info->sg, segs);
> +
> +     err = fill_grant_buffer(info,
> +                             (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
> +     if (err)
> +             goto out_of_memory;
> +
> +     for (i = 0; i < BLK_RING_SIZE; i++) {
> +             info->shadow[i].grants_used = kzalloc(
> +                     sizeof(info->shadow[i].grants_used[0]) * segs,
> +                     GFP_NOIO);
> +             if (info->max_indirect_segments)
> +                     info->shadow[i].indirect_grants = kzalloc(
> +                             sizeof(info->shadow[i].indirect_grants[0]) *
> +                             INDIRECT_GREFS(segs),
> +                             GFP_NOIO);
> +             if ((info->shadow[i].grants_used == NULL) ||
> +                  (info->max_indirect_segments &&
> +                  (info->shadow[i].indirect_grants == NULL)))
> +                     goto out_of_memory;
> +     }
> +
> +
> +     return 0;
> +
> +out_of_memory:
> +     kfree(info->sg);
> +     info->sg = NULL;
> +     for (i = 0; i < BLK_RING_SIZE; i++) {
> +             kfree(info->shadow[i].grants_used);
> +             info->shadow[i].grants_used = NULL;
> +             kfree(info->shadow[i].indirect_grants);
> +             info->shadow[i].indirect_grants = NULL;
> +     }
> +     return -ENOMEM;
> +}
> +
>  /*
>   * Invoked when the backend is finally 'ready' (and has told produced
>   * the details about the physical device - #sectors, size, etc).
> @@ -1415,8 +1744,9 @@ static void blkfront_connect(struct blkfront_info *info)
>               set_capacity(info->gd, sectors);
>               revalidate_disk(info->gd);
>  
> -             /* fall through */
> +             return;
>       case BLKIF_STATE_SUSPENDED:
> +             blkif_recover(info);
>               return;
>  
>       default:
> @@ -1437,6 +1767,7 @@ static void blkfront_connect(struct blkfront_info *info)
>                                info->xbdev->otherend);
>               return;
>       }
> +     info->sector_size = sector_size;
>  
>       info->feature_flush = 0;
>       info->flush_op = 0;
> @@ -1484,6 +1815,13 @@ static void blkfront_connect(struct blkfront_info *info)
>       else
>               info->feature_persistent = persistent;
>  
> +     err = blkfront_setup_indirect(info);
> +     if (err) {
> +             xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
> +                              info->xbdev->otherend);
> +             return;
> +     }
> +
>       err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
>       if (err) {
>               xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
> diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h
> index 01c3d62..6d99849 100644
> --- a/include/xen/interface/io/blkif.h
> +++ b/include/xen/interface/io/blkif.h
> @@ -102,6 +102,8 @@ typedef uint64_t blkif_sector_t;
>   */
>  #define BLKIF_OP_DISCARD           5
>  
> +#define BLKIF_OP_INDIRECT          6
> +
>  /*
>   * Maximum scatter/gather segments per request.
>   * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
> @@ -109,6 +111,16 @@ typedef uint64_t blkif_sector_t;
>   */
>  #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
>  
> +#define BLKIF_MAX_INDIRECT_GREFS_PER_REQUEST 8
> +
> +struct blkif_request_segment_aligned {
> +     grant_ref_t gref;        /* reference to I/O buffer frame        */
> +     /* @first_sect: first sector in frame to transfer (inclusive).   */
> +     /* @last_sect: last sector in frame to transfer (inclusive).     */
> +     uint8_t     first_sect, last_sect;
> +     uint16_t    _pad; /* padding to make it 8 bytes, so it's cache-aligned */
> +} __attribute__((__packed__));
> +
>  struct blkif_request_rw {
>       uint8_t        nr_segments;  /* number of segments                   */
>       blkif_vdev_t   handle;       /* only for read/write requests         */
> @@ -138,11 +150,24 @@ struct blkif_request_discard {
>       uint8_t        _pad3;
>  } __attribute__((__packed__));
>  
> +struct blkif_request_indirect {
> +     uint8_t        indirect_op;
> +     uint16_t       nr_segments;
> +#ifdef CONFIG_X86_64
> +     uint32_t       _pad1;        /* offsetof(blkif_...,u.indirect.id) == 8 */
> +#endif
> +     uint64_t       id;
> +     blkif_vdev_t   handle;
> +     blkif_sector_t sector_number;
> +     grant_ref_t    indirect_grefs[BLKIF_MAX_INDIRECT_GREFS_PER_REQUEST];
> +} __attribute__((__packed__));
> +
>  struct blkif_request {
>       uint8_t        operation;    /* BLKIF_OP_???                         */
>       union {
>               struct blkif_request_rw rw;
>               struct blkif_request_discard discard;
> +             struct blkif_request_indirect indirect;
>       } u;
>  } __attribute__((__packed__));
>  
> -- 
> 1.7.7.5 (Apple Git-26)
> 

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

