[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 1/1] drivers/block/xen-blkback: Limit blkback i/o



From: Vasiliy Tolstov <vase@xxxxxxxx>

This patch provides the ability to limit I/O for each domU block device.
With this patch the dom0 administrator can specify the maximum IOPS for
each block device. Changes apply dynamically and the domU does not need
to be shut down (unlike with dm-ioband). Another advantage is that dom0
does not have to use the CFQ scheduler.

After applying this patch we can control domU disk speed by writing
the desired IOPS maximum for a specific block device.

via sysfs:
echo 1500 > /sys/devices/xen-backend/vbd-1-51712/qos/reqrate

via xenstore:
xenstore write /local/domain/1/device/vbd/51712/reqrate 1500

Current Xen I/O limiting solutions have the following disadvantages:

1) dm-ioband
  It needs another dm layer to be created on top of the block device.
  It lacks the ability to change weights on the fly (the layer has to
  be recreated). It is not in the kernel yet, so the patches need to be
  backported/forward-ported to the specific kernel version. Under our
  heavy load the dm-ioband layer sometimes crashes dom0. If we use
  dm-ioband on an srp->lvm->raid1 setup and the srp target disconnects,
  dm-ioband may corrupt data, leaving the domU fs with many errors.

2) cgroups
  A very good thing, but we cannot use it in our setup. cgroups need
  the CFQ scheduler, but CFQ does not apply to bio-based devices; see
  the device-mapper list: http://goo.gl/YHiyI
  Our setup contains 2 storage nodes that export disks over srp.
  On each storage node we have lvm (not clvm). Each domU has a disk on
  lvm. Before starting a domain on a xen node we construct a raid1 from
  the two lvm vgs. In this case the CFQ scheduler can be applied only
  to the srp disks (/dev/sd*), but then we can only limit all domUs on
  this xen node at the same time.

Signed-off-by: Vasiliy Tolstov <vase@xxxxxxxx>
---
 drivers/block/xen-blkback/blkback.c |   35 +++++++-
 drivers/block/xen-blkback/common.h  |    5 ++
 drivers/block/xen-blkback/xenbus.c  |  152 +++++++++++++++++++++++++++++++++++
 3 files changed, 191 insertions(+), 1 deletion(-)

diff --git a/drivers/block/xen-blkback/blkback.c 
b/drivers/block/xen-blkback/blkback.c
index 74374fb..0672ab0 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -387,10 +387,18 @@ static void print_stats(struct xen_blkif *blkif)
        blkif->st_ds_req = 0;
 }
 
+static void refill_iops(struct xen_blkif *blkif)
+{
+       blkif->reqtime = jiffies + msecs_to_jiffies(1000); /* window end */
+       blkif->reqcount = 0;    /* nothing consumed in the new window yet */
+}
+
 int xen_blkif_schedule(void *arg)
 {
        struct xen_blkif *blkif = arg;
        struct xen_vbd *vbd = &blkif->vbd;
+       int     ret = 0;
+       struct timeval cur_time;
 
        xen_blkif_get(blkif);
 
@@ -411,9 +419,20 @@ int xen_blkif_schedule(void *arg)
                blkif->waiting_reqs = 0;
                smp_mb(); /* clear flag *before* checking for work */
 
-               if (do_block_io_op(blkif))
+               ret = do_block_io_op(blkif);
+               if (ret)
                        blkif->waiting_reqs = 1;
 
+               if (blkif->reqrate) {
+                       if (2 == ret && (blkif->reqtime > jiffies)) {
+                               jiffies_to_timeval(jiffies, &cur_time);
+                               set_current_state(TASK_INTERRUPTIBLE);
+                               schedule_timeout(blkif->reqtime - jiffies);
+                       }
+                       if (time_after(jiffies, blkif->reqtime))
+                               refill_iops(blkif);
+               }
+
                if (log_stats && time_after(jiffies, blkif->st_print))
                        print_stats(blkif);
        }
@@ -760,6 +779,10 @@ __do_block_io_op(struct xen_blkif *blkif)
        rp = blk_rings->common.sring->req_prod;
        rmb(); /* Ensure we see queued requests up to 'rp'. */
 
+       if (blkif->reqrate && (blkif->reqcount >= blkif->reqrate)) {
+               return (rc != rp) ? 2 : 0;
+       }
+
        while (rc != rp) {
 
                if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
@@ -770,6 +793,13 @@ __do_block_io_op(struct xen_blkif *blkif)
                        break;
                }
 
+               if (blkif->reqrate) {
+                       if (blkif->reqcount >= blkif->reqrate) {
+                               more_to_do = 2;
+                               break;
+                       }
+               }
+
                pending_req = alloc_req();
                if (NULL == pending_req) {
                        blkif->st_oo_req++;
@@ -792,6 +822,7 @@ __do_block_io_op(struct xen_blkif *blkif)
                }
                blk_rings->common.req_cons = ++rc; /* before make_response() */
 
+               blkif->reqcount++;
                /* Apply all sanity checks to /private copy/ of request. */
                barrier();
                if (unlikely(req.operation == BLKIF_OP_DISCARD)) {
@@ -842,6 +873,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
        struct blk_plug plug;
        bool drain = false;
        struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+       struct timeval cur_time;
 
        switch (req->operation) {
        case BLKIF_OP_READ:
@@ -992,6 +1024,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
        else if (operation & WRITE)
                blkif->st_wr_sect += preq.nr_sects;
 
+       jiffies_to_timeval(jiffies, &cur_time);
        return 0;
 
  fail_flush:
diff --git a/drivers/block/xen-blkback/common.h 
b/drivers/block/xen-blkback/common.h
index 6072390..0552ce3 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -206,6 +206,11 @@ struct xen_blkif {
        struct rb_root          persistent_gnts;
        unsigned int            persistent_gnt_c;
 
+       /* qos information */
+       unsigned long   reqtime;
+       int    reqcount;
+       int    reqrate;
+
        /* statistics */
        unsigned long           st_print;
        int                     st_rd_req;
diff --git a/drivers/block/xen-blkback/xenbus.c 
b/drivers/block/xen-blkback/xenbus.c
index 6398072..f8afe76 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -25,6 +25,7 @@ struct backend_info {
        struct xenbus_device    *dev;
        struct xen_blkif        *blkif;
        struct xenbus_watch     backend_watch;
+       struct xenbus_watch     reqrate_watch;
        unsigned                major;
        unsigned                minor;
        char                    *mode;
@@ -230,6 +231,79 @@ int __init xen_blkif_interface_init(void)
        }                                                               \
        static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
 
+/* sysfs read of qos/reqrate: print the device's current reqrate value. */
+static ssize_t
+show_reqrate(struct device *_dev, struct device_attribute *attr, char *buf)
+{
+       struct backend_info *info;
+       ssize_t len;
+
+       if (get_device(_dev) == NULL)
+               return -ENODEV;
+
+       info = dev_get_drvdata(&to_xenbus_device(_dev)->dev);
+       if (info)
+               len = sprintf(buf, "%d\n", info->blkif->reqrate);
+       else
+               len = -ENODEV;
+
+       put_device(_dev);
+
+       return len;
+}
+
+/* sysfs write of qos/reqrate: set the request-rate limit (<= 0 disables). */
+static ssize_t
+store_reqrate(struct device *_dev, struct device_attribute *attr,
+               const char *buf, size_t size)
+{
+       int value;
+       struct xenbus_device *dev;
+       struct backend_info *be;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       /* Parse first: a bad value must not leak a device reference. */
+       if (sscanf(buf, "%d", &value) != 1)
+               return -EINVAL;
+
+       if (!get_device(_dev))
+               return -ENODEV;
+
+       dev = to_xenbus_device(_dev);
+       be = dev_get_drvdata(&dev->dev);
+       /* Clamp like reqrate_changed(): non-positive means "no limit". */
+       if (be != NULL)
+               be->blkif->reqrate = value > 0 ? value : 0;
+
+       put_device(_dev);
+       return size;
+}
+static DEVICE_ATTR(reqrate, S_IRUGO | S_IWUSR, show_reqrate, store_reqrate);
+
+/* sysfs read of qos/reqcount; read-only, so no store method and no S_IWUSR. */
+static ssize_t
+show_reqcount(struct device *_dev, struct device_attribute *attr, char *buf)
+{
+       ssize_t ret = -ENODEV;
+       struct xenbus_device *dev;
+       struct backend_info *be;
+
+       if (!get_device(_dev))
+               return ret;
+
+       dev = to_xenbus_device(_dev);
+       be = dev_get_drvdata(&dev->dev);
+
+       if (be != NULL)
+               ret = sprintf(buf, "%d\n", be->blkif->reqcount);
+
+       put_device(_dev);
+       return ret;
+}
+static DEVICE_ATTR(reqcount, S_IRUGO, show_reqcount, NULL);
+
 VBD_SHOW(oo_req,  "%d\n", be->blkif->st_oo_req);
 VBD_SHOW(rd_req,  "%d\n", be->blkif->st_rd_req);
 VBD_SHOW(wr_req,  "%d\n", be->blkif->st_wr_req);
@@ -254,6 +328,17 @@ static struct attribute_group xen_vbdstat_group = {
        .attrs = xen_vbdstat_attrs,
 };
 
+static struct attribute *xen_vbdreq_attrs[] = {
+       &dev_attr_reqrate.attr,
+       &dev_attr_reqcount.attr,
+       NULL
+};
+
+static const struct attribute_group xen_vbdreq_group = {
+       .name = "qos",          /* attributes live under <device>/qos/ */
+       .attrs = xen_vbdreq_attrs,
+};
+
 VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
 VBD_SHOW(mode, "%s\n", be->mode);
 
@@ -273,8 +358,13 @@ static int xenvbd_sysfs_addif(struct xenbus_device *dev)
        if (error)
                goto fail3;
 
+       error = sysfs_create_group(&dev->dev.kobj, &xen_vbdreq_group);
+       if (error)
+               goto fail4;
+
        return 0;
 
+fail4: sysfs_remove_group(&dev->dev.kobj, &xen_vbdreq_group);
 fail3: sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group);
 fail2: device_remove_file(&dev->dev, &dev_attr_mode);
 fail1: device_remove_file(&dev->dev, &dev_attr_physical_device);
@@ -283,6 +373,7 @@ fail1:      device_remove_file(&dev->dev, 
&dev_attr_physical_device);
 
 static void xenvbd_sysfs_delif(struct xenbus_device *dev)
 {
+       sysfs_remove_group(&dev->dev.kobj, &xen_vbdreq_group);
        sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group);
        device_remove_file(&dev->dev, &dev_attr_mode);
        device_remove_file(&dev->dev, &dev_attr_physical_device);
@@ -360,6 +451,12 @@ static int xen_blkbk_remove(struct xenbus_device *dev)
                be->backend_watch.node = NULL;
        }
 
+       if (be->reqrate_watch.node) {
+               unregister_xenbus_watch(&be->reqrate_watch);
+               kfree(be->reqrate_watch.node);
+               be->reqrate_watch.node = NULL;
+       }
+
        if (be->blkif) {
                xen_blkif_disconnect(be->blkif);
                xen_vbd_free(&be->blkif->vbd);
@@ -503,6 +600,7 @@ static void backend_changed(struct xenbus_watch *watch,
        struct xenbus_device *dev = be->dev;
        int cdrom = 0;
        char *device_type;
+       char name[TASK_COMM_LEN];
 
        DPRINTK("");
 
@@ -542,6 +640,21 @@ static void backend_changed(struct xenbus_watch *watch,
                kfree(device_type);
        }
 
+       /* gather information about QoS policy for this device. */
+       err = blkback_name(be->blkif, name);
+       if (err) {
+               xenbus_dev_error(be->dev, err, "get blkback dev name");
+               return;
+       }
+
+       err = xenbus_gather(XBT_NIL, dev->otherend,
+                               "reqrate", "%d", &be->blkif->reqrate,
+                               NULL);
+       if (err)
+               DPRINTK("%s xenbus_gather(reqrate) error", name);
+
+       be->blkif->reqtime = jiffies;
+
        if (be->major == 0 && be->minor == 0) {
                /* Front end dir is a number, which is used as the handle. */
 
@@ -645,6 +758,30 @@ static void frontend_changed(struct xenbus_device *dev,
 
 /* ** Connection ** */
 
+/* xenbus watch callback: re-read "reqrate" from be->dev->otherend. */
+static void reqrate_changed(struct xenbus_watch *watch,
+                       const char **vec, unsigned int len)
+{
+       struct backend_info *be;
+       struct xen_blkif *blkif;
+       char name[TASK_COMM_LEN];
+       int err;
+
+       be = container_of(watch, struct backend_info, reqrate_watch);
+       blkif = be->blkif;
+       err = blkback_name(blkif, name);
+       if (err) {
+               xenbus_dev_error(be->dev, err, "get blkback dev name");
+               return;
+       }
+
+       err = xenbus_gather(XBT_NIL, be->dev->otherend, "reqrate", "%d",
+                           &blkif->reqrate, NULL);
+       if (err)
+               DPRINTK("%s xenbus_gather(reqrate) error", name);
+       else if (blkif->reqrate < 0)
+               blkif->reqrate = 0; /* negative is treated as "no limit" */
+}
 
 /*
  * Write the physical details regarding the block device to the store, and
@@ -717,6 +854,21 @@ again:
                xenbus_dev_fatal(dev, err, "%s: switching to Connected state",
                                 dev->nodename);
 
+       if (be->reqrate_watch.node) {
+               unregister_xenbus_watch(&be->reqrate_watch);
+               kfree(be->reqrate_watch.node);
+               be->reqrate_watch.node = NULL;
+       }
+
+       err = xenbus_watch_path2(dev, dev->otherend, "reqrate",
+                                       &be->reqrate_watch,
+                                       reqrate_changed);
+       if (err) {
+               xenbus_dev_fatal(dev, err, "%s: watching reqrate",
+                                       dev->nodename);
+               goto abort;
+       }
+
        return;
  abort:
        xenbus_transaction_end(xbt, 1);
-- 
1.7.9.5


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.