[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] xen: add persistent grants to xbd_xenbus



This patch implements persistent grants for xbd_xenbus (blkfront in
Linux terminology). The effect of this change is to reduce the number
of unmap operations performed, since they cause a (costly) TLB
shootdown. This allows the I/O performance to scale better when a
large number of VMs are performing I/O.

On startup xbd_xenbus notifies the backend driver that it is using
persistent grants. If the backend driver is not capable of persistent
mapping, xbd_xenbus will still use the same grants, since it is
compatible with the previous protocol, and simplifies the code
complexity in xbd_xenbus.

Each time a request is send to the backend driver, xbd_xenbus will
check if there are free grants already mapped and will try to use one
of those. If there are not enough grants already mapped, xbd_xenbus
will request new grants, and they will be added to the list of free
grants when the transaction is finished, so they can be reused. Data
has to be copied from the request (struct buf) to the mapped grant, or
from the mapped grant to the request, depending on the operation being
performed.

To test the performance impact of this patch I've benchmarked the
number of IOPS performed simultaneously by 15 NetBSD DomU guests on a
Linux Dom0 that supports persistent grants. The backend used to
perform this tests was a ramdisk of size 1GB.

                     Sum of IOPS
Non-persistent            336718
Persistent                686010

As seen, there's a x2 increase in the total number of IOPS being
performed. As a reference, using exactly the same setup Linux DomUs
are able to achieve 1102019 IOPS, so there's still plenty of room for
improvement.

As said previously this implementation only covers for the NetBSD
front-end driver.

Cc: xen-devel@xxxxxxxxxxxxx
---
 sys/arch/xen/xen/xbd_xenbus.c |  174 ++++++++++++++++++++++++----------------
 1 files changed, 104 insertions(+), 70 deletions(-)

diff --git a/sys/arch/xen/xen/xbd_xenbus.c b/sys/arch/xen/xen/xbd_xenbus.c
index a9221e7..4ece0b0 100644
--- a/sys/arch/xen/xen/xbd_xenbus.c
+++ b/sys/arch/xen/xen/xbd_xenbus.c
@@ -68,6 +68,7 @@ __KERNEL_RCSID(0, "$NetBSD: xbd_xenbus.c,v 1.57 2012/05/25 
15:03:38 elric Exp $"
 #include <sys/systm.h>
 #include <sys/stat.h>
 #include <sys/vnode.h>
+#include <sys/malloc.h>
 
 #include <dev/dkvar.h>
 
@@ -99,12 +100,18 @@ __KERNEL_RCSID(0, "$NetBSD: xbd_xenbus.c,v 1.57 2012/05/25 
15:03:38 elric Exp $"
 #define XEN_BSHIFT      9               /* log2(XEN_BSIZE) */
 #define XEN_BSIZE       (1 << XEN_BSHIFT) 
 
+struct grant {
+       SLIST_ENTRY(grant) gnt_next;
+       grant_ref_t gref;
+       vaddr_t va;
+};
+
 struct xbd_req {
        SLIST_ENTRY(xbd_req) req_next;
        uint16_t req_id; /* ID passed to backend */
        union {
            struct {
-               grant_ref_t req_gntref[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+               struct grant *req_gntref[BLKIF_MAX_SEGMENTS_PER_REQUEST];
                int req_nr_segments; /* number of segments in this request */
                struct buf *req_bp; /* buffer associated with this request */
                void *req_data; /* pointer to the data buffer */
@@ -152,6 +159,8 @@ struct xbd_xenbus_softc {
        u_long sc_handle; /* from backend */
        int sc_cache_flush; /* backend supports BLKIF_OP_FLUSH_DISKCACHE */
        krndsource_t     sc_rnd_source;
+       SLIST_HEAD(,grant) sc_free_grants; /* list of available mapped grants */
+       int sc_persistent_grants; /* backend supports persistent grants */
 };
 
 #if 0
@@ -172,9 +181,6 @@ static int  xbdstart(struct dk_softc *, struct buf *);
 static void xbd_backend_changed(void *, XenbusState);
 static void xbd_connect(struct xbd_xenbus_softc *);
 
-static int  xbd_map_align(struct xbd_req *);
-static void xbd_unmap_align(struct xbd_req *);
-
 static void xbdminphys(struct buf *);
 
 CFATTACH_DECL3_NEW(xbd, sizeof(struct xbd_xenbus_softc),
@@ -217,6 +223,23 @@ static struct dkdriver xbddkdriver = {
        .d_minphys = xbdminphys,
 };
 
+static void
+xbd_clear_grants(struct xbd_xenbus_softc *sc)
+{
+       struct grant *entry;
+
+       /* Unmap and clear the list of persistent grants */
+       while (!SLIST_EMPTY(&sc->sc_free_grants)) {
+               entry = SLIST_FIRST(&sc->sc_free_grants);
+               SLIST_REMOVE_HEAD(&sc->sc_free_grants, gnt_next);
+               if (__predict_false(xengnt_status(entry->gref)))
+                       panic("grant still used");
+               xengnt_revoke_access(entry->gref);
+               uvm_km_kmem_free(kmem_va_arena, entry->va, PAGE_SIZE);
+               free(entry, M_DEVBUF);
+       }
+}
+
 static int
 xbd_xenbus_match(device_t parent, cfdata_t match, void *aux)
 {
@@ -285,6 +308,7 @@ xbd_xenbus_attach(device_t parent, device_t self, void *aux)
                SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, &sc->sc_reqs[i],
                    req_next);
        }
+       SLIST_INIT(&sc->sc_free_grants);
 
        sc->sc_backend_status = BLKIF_STATE_DISCONNECTED;
        sc->sc_shutdown = BLKIF_SHUTDOWN_REMOTE;
@@ -368,6 +392,8 @@ xbd_xenbus_detach(device_t dev, int flags)
                rnd_detach_source(&sc->sc_rnd_source);
        }
 
+       xbd_clear_grants(sc);
+
        hypervisor_mask_event(sc->sc_evtchn);
        event_remove_handler(sc->sc_evtchn, &xbd_handler, sc);
        while (xengnt_status(sc->sc_ring_gntref)) {
@@ -402,6 +428,8 @@ xbd_xenbus_suspend(device_t dev, const pmf_qual_t *qual) {
 
        splx(s);
 
+       xbd_clear_grants(sc);
+
        xenbus_device_suspend(sc->sc_xbusd);
        aprint_verbose_dev(dev, "removed event channel %d\n", sc->sc_evtchn);
 
@@ -475,6 +503,12 @@ again:
                errmsg = "writing protocol";
                goto abort_transaction;
        }
+       error = xenbus_printf(xbt, sc->sc_xbusd->xbusd_path,
+           "feature-persistent", "%u", 1);
+       if (error) {
+               errmsg = "writing feature-persistent";
+               goto abort_transaction;
+       }
        error = xenbus_transaction_end(xbt, 0);
        if (error == EAGAIN)
                goto again;
@@ -614,7 +648,7 @@ xbd_connect(struct xbd_xenbus_softc *sc)
 {
        int err;
        unsigned long long sectors;
-       u_long cache_flush;
+       u_long cache_flush, feature_persistent;
 
        err = xenbus_read_ul(NULL,
            sc->sc_xbusd->xbusd_path, "virtual-device", &sc->sc_handle, 10);
@@ -652,6 +686,15 @@ xbd_connect(struct xbd_xenbus_softc *sc)
        else
                sc->sc_cache_flush = 0;
 
+       err = xenbus_read_ul(NULL, sc->sc_xbusd->xbusd_otherend,
+           "feature-persistent", &feature_persistent, 10);
+       if (err)
+               feature_persistent = 0;
+       if (feature_persistent > 0)
+               sc->sc_persistent_grants = 1;
+       else
+               sc->sc_persistent_grants = 0;
+
        xenbus_switch_state(sc->sc_xbusd, NULL, XenbusStateConnected);
 }
 
@@ -662,7 +705,8 @@ xbd_handler(void *arg)
        struct buf *bp;
        RING_IDX resp_prod, i;
        int more_to_do;
-       int seg;
+       int seg, nbytes, ncopy;
+       vaddr_t va;
 
        DPRINTF(("xbd_handler(%s)\n", device_xname(sc->sc_dksc.sc_dev)));
 
@@ -684,18 +728,6 @@ again:
                        /* caller will free the req */
                        continue;
                }
-               for (seg = xbdreq->req_nr_segments - 1; seg >= 0; seg--) {
-                       if (__predict_false(
-                           xengnt_status(xbdreq->req_gntref[seg]))) {
-                               aprint_verbose_dev(sc->sc_dksc.sc_dev,
-                                       "grant still used by backend\n");
-                               sc->sc_ring.rsp_cons = i;
-                               xbdreq->req_nr_segments = seg + 1;
-                               goto done;
-                       }
-                       xengnt_revoke_access(xbdreq->req_gntref[seg]);
-                       xbdreq->req_nr_segments--;
-               }
                if (rep->operation != BLKIF_OP_READ &&
                    rep->operation != BLKIF_OP_WRITE) {
                        aprint_error_dev(sc->sc_dksc.sc_dev,
@@ -709,10 +741,20 @@ again:
                        bp->b_resid = bp->b_bcount;
                        goto next;
                }
-               /* b_resid was set in xbdstart */
+
+               if (rep->operation == BLKIF_OP_READ) {
+                       /* Read data from the request */
+                       nbytes = xbdreq->req_bp->b_bcount;
+                       va = (vaddr_t)xbdreq->req_bp->b_data;
+                       for (seg = 0; seg < xbdreq->req_nr_segments; seg++) {
+                               ncopy = nbytes > PAGE_SIZE ? PAGE_SIZE : nbytes;
+                               memcpy((void *) va, (void *) 
xbdreq->req_gntref[seg]->va,
+                                          ncopy);
+                               nbytes -= ncopy;
+                               va += ncopy;
+                       }
+               }
 next:
-               if (bp->b_data != xbdreq->req_data)
-                       xbd_unmap_align(xbdreq);
                disk_unbusy(&sc->sc_dksc.sc_dkdev,
                    (bp->b_bcount - bp->b_resid),
                    (bp->b_flags & B_READ));
@@ -720,8 +762,12 @@ next:
                    bp->b_blkno);
                biodone(bp);
                SLIST_INSERT_HEAD(&sc->sc_xbdreq_head, xbdreq, req_next);
+               /* Insert the grants back into the list of available grants */
+               for (seg = 0; seg < xbdreq->req_nr_segments; seg++)
+                       SLIST_INSERT_HEAD(&sc->sc_free_grants,
+                                         xbdreq->req_gntref[seg], gnt_next);
        }
-done:
+
        xen_rmb();
        sc->sc_ring.rsp_cons = i;
 
@@ -944,9 +990,11 @@ xbdstart(struct dk_softc *dksc, struct buf *bp)
        int ret = 0, runqueue = 1;
        size_t bcount, off;
        paddr_t ma;
-       vaddr_t va;
-       int nsects, nbytes, seg;
+       vaddr_t va = 0;
+       vaddr_t va_newmap;
+       int nsects, nbytes, seg, s, rc;
        int notify;
+       struct grant *gnt;
 
        DPRINTF(("xbdstart(%p): b_bcount = %ld\n", bp, (long)bp->b_bcount));
 
@@ -990,13 +1038,6 @@ xbdstart(struct dk_softc *dksc, struct buf *bp)
        }
 
        xbdreq->req_bp = bp;
-       xbdreq->req_data = bp->b_data;
-       if ((vaddr_t)bp->b_data & (XEN_BSIZE - 1)) {
-               if (__predict_false(xbd_map_align(xbdreq) != 0)) {
-                       ret = -1;
-                       goto out;
-               }
-       }
        /* now we're sure we'll send this buf */
        disk_busy(&dksc->sc_dkdev);
        SLIST_REMOVE_HEAD(&sc->sc_xbdreq_head, req_next);
@@ -1006,8 +1047,6 @@ xbdstart(struct dk_softc *dksc, struct buf *bp)
        req->sector_number = bp->b_rawblkno;
        req->handle = sc->sc_handle;
 
-       va = (vaddr_t)xbdreq->req_data & ~PAGE_MASK;
-       off = (vaddr_t)xbdreq->req_data & PAGE_MASK;
        if (bp->b_rawblkno + bp->b_bcount / DEV_BSIZE >= sc->sc_xbdsize) {
                bcount = (sc->sc_xbdsize - bp->b_rawblkno) * DEV_BSIZE;
                bp->b_resid = bp->b_bcount - bcount;
@@ -1020,8 +1059,32 @@ xbdstart(struct dk_softc *dksc, struct buf *bp)
                bcount = XBD_MAX_XFER;
        }
        for (seg = 0; bcount > 0;) {
-               pmap_extract_ma(pmap_kernel(), va, &ma);
-               KASSERT((ma & (XEN_BSIZE - 1)) == 0);
+               /* Get an already mapped grant or map a new one */
+               if (!SLIST_EMPTY(&sc->sc_free_grants)) {
+                       gnt = SLIST_FIRST(&sc->sc_free_grants);
+                       SLIST_REMOVE_HEAD(&sc->sc_free_grants, gnt_next);
+               } else {
+                       s = splvm();
+                       rc = uvm_km_kmem_alloc(kmem_va_arena, PAGE_SIZE,
+                                              (VM_NOSLEEP | VM_INSTANTFIT),
+                                              (vmem_addr_t *)&va_newmap);
+                       splx(s);
+                       if (__predict_false(rc != 0))
+                               panic("xbdstart: unable to map memory");
+                       pmap_extract_ma(pmap_kernel(), va_newmap, &ma);
+                       KASSERT((ma & (XEN_BSIZE - 1)) == 0);
+                       gnt = malloc(sizeof(struct grant), M_DEVBUF, M_NOWAIT);
+                       if (__predict_false(xengnt_grant_access(
+                       sc->sc_xbusd->xbusd_otherend_id, ma, 0,
+                       &gnt->gref)))
+                               panic("xbdstart: xengnt_grant_access");
+                       gnt->va = va_newmap;
+               }
+               xbdreq->req_data = (void *)gnt->va;
+               if (seg == 0) {
+                       off = (vaddr_t)xbdreq->req_data & PAGE_MASK;
+                       va = (vaddr_t)bp->b_data;
+               }
                if (bcount > PAGE_SIZE - off)
                        nbytes = PAGE_SIZE - off;
                else
@@ -1031,11 +1094,12 @@ xbdstart(struct dk_softc *dksc, struct buf *bp)
                req->seg[seg].last_sect = (off >> XEN_BSHIFT) + nsects - 1;
                KASSERT(req->seg[seg].first_sect <= req->seg[seg].last_sect);
                KASSERT(req->seg[seg].last_sect < 8);
-               if (__predict_false(xengnt_grant_access(
-                   sc->sc_xbusd->xbusd_otherend_id, ma,
-                   (bp->b_flags & B_READ) == 0, &xbdreq->req_gntref[seg])))
-                       panic("xbdstart: xengnt_grant_access"); /* XXX XXX !!! 
*/
-               req->seg[seg].gref = xbdreq->req_gntref[seg];
+               if (req->operation == BLKIF_OP_WRITE)
+                       /* Copy data from the buf to the request */
+                       memcpy((void *)(gnt->va + off), (void *)(va + off), 
nbytes);
+
+               req->seg[seg].gref = gnt->gref;
+               xbdreq->req_gntref[seg] = gnt;
                seg++;
                KASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
                va += PAGE_SIZE;
@@ -1063,33 +1127,3 @@ err:
        biodone(bp);
        return 0;
 }
-
-static int
-xbd_map_align(struct xbd_req *req)
-{
-       int s = splvm();
-       int rc;
-
-       rc = uvm_km_kmem_alloc(kmem_va_arena,
-           req->req_bp->b_bcount, (VM_NOSLEEP | VM_INSTANTFIT),
-           (vmem_addr_t *)&req->req_data);
-       splx(s);
-       if (__predict_false(rc != 0))
-               return ENOMEM;
-       if ((req->req_bp->b_flags & B_READ) == 0)
-               memcpy(req->req_data, req->req_bp->b_data,
-                   req->req_bp->b_bcount);
-       return 0;
-}
-
-static void
-xbd_unmap_align(struct xbd_req *req)
-{
-       int s;
-       if (req->req_bp->b_flags & B_READ)
-               memcpy(req->req_bp->b_data, req->req_data,
-                   req->req_bp->b_bcount);
-       s = splvm();
-       uvm_km_kmem_free(kmem_va_arena, (vaddr_t)req->req_data, 
req->req_bp->b_bcount);
-       splx(s);
-}
-- 
1.7.7.5 (Apple Git-26)


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.