
[Xen-devel] [RFC] Support of non-indirect grant backend on 64KB guest



Hi,

Firstly, this patch is not ready at all and is mostly here to collect comments 
about the approach. It's not clean, so there is no need to complain about the 
coding style.

The qdisk backend in QEMU does not support indirect grants, which means a 
single request can only carry 11 * 4KB = 44KB of data.

When using 64KB pages, a Linux block request (struct request) may contain up to 
64KB of data. This is because the block segment size must be at least the size 
of a Linux page.

So when indirect grants are not supported by the backend, we are not able to 
fit all the data in a single request. We therefore need to create a second 
request to carry the rest of the data.
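
To make the arithmetic explicit, here is a small userspace sketch (not part of 
the patch; LINUX_PAGE_SIZE is just a local name for the 64KB guest page size, 
the other constants match the ones used by blkfront):

    #include <stdio.h>

    #define XEN_PAGE_SIZE                   4096    /* grant granularity   */
    #define LINUX_PAGE_SIZE                 65536   /* 64KB guest page     */
    #define BLKIF_MAX_SEGMENTS_PER_REQUEST  11      /* no indirect grants  */

    int main(void)
    {
        /* A single 64KB block segment spans 16 grants... */
        unsigned int grants = LINUX_PAGE_SIZE / XEN_PAGE_SIZE;

        /* ...but only 11 of them fit in one ring request, so the remaining
         * 5 have to go into a second request that starts
         * 11 * 4KB / 512 = 88 sectors after the first one. */
        unsigned int first  = BLKIF_MAX_SEGMENTS_PER_REQUEST;
        unsigned int second = grants - first;
        unsigned int sector_offset =
            (BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;

        printf("%u grants: %u + %u, second request offset %u sectors\n",
               grants, first, second, sector_offset);
        return 0;
    }

This is the same offset that blkif_setup_extra_req() computes in the patch 
below when filling in the second request's sector_number.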

I wrote a patch last week which makes a 64KB guest boot with qdisk. However, 
I'm not sure this is the right way to do it, and I would appreciate it if one 
of the block maintainers could give me some insight about it.

The patch can be found below.

Regards,

commit 62922ae04af371bcb6e4467eb2e470d83dac2a81
Author: Julien Grall <julien.grall@xxxxxxxxxx>
Date:   Thu Aug 13 13:13:35 2015 +0100

    blkfront: Start to handle non-indirect grant

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 68ca4e5..76247ab 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -79,6 +79,13 @@ struct blk_shadow {
        struct blk_grant **indirect_grants;
        struct scatterlist *sg;
        unsigned int num_sg;
+       enum
+       {
+               REQ_WAITING,
+               REQ_DONE,
+               REQ_FAIL
+       } status;
+       unsigned long associated_id;
 };
 
 struct split_bio {
@@ -467,6 +474,7 @@ static unsigned long blkif_ring_get_request(struct blkfront_info *info,
 
        id = get_id_from_freelist(info);
        info->shadow[id].request = req;
+       info->shadow[id].status = REQ_WAITING;
 
        (*ring_req)->u.rw.id = id;
 
@@ -508,6 +516,9 @@ struct setup_rw_req {
        bool need_copy;
        unsigned int bvec_off;
        char *bvec_data;
+
+       bool require_extra_req;
+       struct blkif_request *ring_req2;
 };
 
 static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
@@ -517,12 +528,20 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
        int n, ref;
        struct blk_grant *gnt_list_entry;
        unsigned int fsect, lsect;
+       struct blkif_request *ring_req;
        /* Convenient aliases */
        unsigned int grant_idx = setup->grant_idx;
-       struct blkif_request *ring_req = setup->ring_req;
        struct blkfront_info *info = setup->info;
        struct blk_shadow *shadow = &info->shadow[setup->id];
 
+       if (likely(!setup->require_extra_req ||
+                  grant_idx < BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+               ring_req = setup->ring_req;
+       } else {
+               grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST;
+               ring_req = setup->ring_req2;
+       }
+
        if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
            (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
                if (setup->segments)
@@ -537,7 +556,7 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
 
        gnt_list_entry = get_grant(&setup->gref_head, gfn, info);
        ref = gnt_list_entry->gref;
-       shadow->grants_used[grant_idx] = gnt_list_entry;
+       shadow->grants_used[setup->grant_idx] = gnt_list_entry;
 
        if (setup->need_copy) {
                void *shared_data;
@@ -579,11 +598,31 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
        (setup->grant_idx)++;
 }
 
+static void blkif_setup_extra_req(struct blkif_request *first,
+                                 struct blkif_request *second)
+{
+       uint16_t nr_segments = first->u.rw.nr_segments;
+
+
+       /* The second request is only present when the first request uses
+        * all its segments. It's always the continuity of the first one
+        */
+       first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+       second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST;
+       second->u.rw.sector_number = first->u.rw.sector_number +
+               (BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;
+
+       second->u.rw.handle = first->u.rw.handle;
+       second->operation = first->operation;
+}
+
 static int blkif_queue_rw_req(struct request *req)
 {
        struct blkfront_info *info = req->rq_disk->private_data;
-       struct blkif_request *ring_req;
-       unsigned long id;
+       struct blkif_request *ring_req, *ring_req2 = NULL;
+       unsigned long id, id2 = ~0;
+       bool require_extra_req = false;
        int i;
        struct setup_rw_req setup = {
                .grant_idx = 0,
@@ -628,19 +667,28 @@ static int blkif_queue_rw_req(struct request *req)
        /* Fill out a communications ring structure. */
        id = blkif_ring_get_request(info, req, &ring_req);
 
-       BUG_ON(info->max_indirect_segments == 0 &&
-              GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST);
-       BUG_ON(info->max_indirect_segments &&
-              GREFS(req->nr_phys_segments) > info->max_indirect_segments);
-
        num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
        num_grant = 0;
        /* Calculate the number of grant used */
        for_each_sg(info->shadow[id].sg, sg, num_sg, i)
               num_grant += gnttab_count_grant(sg->offset, sg->length);
 
+       require_extra_req = info->max_indirect_segments == 0 &&
+               num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST;
+       BUG_ON((XEN_PAGE_SIZE == PAGE_SIZE) && require_extra_req);
+
+       if (unlikely(require_extra_req))
+       {
+               id2 = blkif_ring_get_request(info, req, &ring_req2);
+               info->shadow[id2].num_sg = 0;
+               info->shadow[id2].associated_id = id;
+       }
+
+       info->shadow[id].associated_id = id2;
+
        info->shadow[id].num_sg = num_sg;
-       if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+       if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
+           likely(!require_extra_req)) {
                /*
                 * The indirect operation can only be a BLKIF_OP_READ or
                 * BLKIF_OP_WRITE
@@ -680,10 +728,17 @@ static int blkif_queue_rw_req(struct request *req)
                        }
                }
                ring_req->u.rw.nr_segments = num_grant;
+               if (unlikely(require_extra_req))
+                       blkif_setup_extra_req(ring_req, ring_req2);
        }
 
        setup.ring_req = ring_req;
        setup.id = id;
+
+       setup.require_extra_req = require_extra_req;
+       if (unlikely(require_extra_req))
+               setup.ring_req2 = ring_req2;
+
        for_each_sg(info->shadow[id].sg, sg, num_sg, i) {
                BUG_ON(sg->offset + sg->length > PAGE_SIZE);
 
@@ -706,6 +761,8 @@ static int blkif_queue_rw_req(struct request *req)
 
        /* Keep a private copy so we can reissue requests when recovering. */
        info->shadow[id].req = *ring_req;
+       if (unlikely(require_extra_req))
+               info->shadow[id2].req = *ring_req2;
 
        if (new_persistent_gnts)
                gnttab_free_grant_references(setup.gref_head);
@@ -797,7 +854,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
        memset(&info->tag_set, 0, sizeof(info->tag_set));
        info->tag_set.ops = &blkfront_mq_ops;
        info->tag_set.nr_hw_queues = 1;
-       info->tag_set.queue_depth =  BLK_RING_SIZE(info);
+       info->tag_set.queue_depth =  BLK_RING_SIZE(info) / 2;
        info->tag_set.numa_node = NUMA_NO_NODE;
        info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
        info->tag_set.cmd_size = 0;
@@ -822,6 +879,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
                        queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq);
        }
 
+
        /* Hard sector size and max sectors impersonate the equiv. hardware. */
        blk_queue_logical_block_size(rq, sector_size);
        blk_queue_physical_block_size(rq, physical_sector_size);
@@ -1229,7 +1287,21 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
        };
 
        num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
-               s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
+                       s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
+
+       if (unlikely(s->associated_id != ~0)) {
+               struct blk_shadow *s2 = &info->shadow[s->associated_id];
+               BUG_ON(s->req.operation == BLKIF_OP_INDIRECT);
+
+               num_grant += s2->req.u.rw.nr_segments;
+
+               /* Only the first request can have sg != 0 */
+               if (s2->num_sg != 0) {
+                       data.s = s2;
+                       s = s2;
+               }
+       }
+
        num_sg = s->num_sg;
 
        if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
@@ -1248,6 +1320,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
                        kunmap_atomic(data.bvec_data);
                }
        }
+
        /* Add the persistent grant into the list of free grants */
        for (i = 0; i < num_grant; i++) {
                if (gnttab_query_foreign_access(s->grants_used[i]->gref)) {
@@ -1337,9 +1410,22 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
                }
                req  = info->shadow[id].request;
 
-               if (bret->operation != BLKIF_OP_DISCARD)
+               if (bret->operation != BLKIF_OP_DISCARD) {
+                       unsigned long id2 = info->shadow[id].associated_id;
+
+                       if (unlikely(id2 != ~0)) {
+                       info->shadow[id].status = (bret->status == BLKIF_RSP_OKAY) ? REQ_DONE : REQ_FAIL;
+
+                               if (info->shadow[id2].status == REQ_WAITING)
+                                       continue;
+                       }
+
                        blkif_completion(&info->shadow[id], info, bret);
 
+                       if (unlikely(id2 != ~0))
+                               BUG_ON(add_id_to_freelist(info, id2));
+               }
+
                if (add_id_to_freelist(info, id)) {
                        WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
                             info->gd->disk_name, op_name(bret->operation), id);
@@ -1874,7 +1960,13 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
                                                  xen_blkif_max_segments);
                grants = info->max_indirect_segments;
        }
+
        psegs = grants / GRANTS_PER_PSEG;
+       if (!psegs)
+       {
+               psegs = 1;
+               grants = GRANTS_PER_PSEG;
+       }
 
        err = fill_grant_buffer(info,
                                (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info));

-- 
Julien Grall
