Re: [Xen-users] Re: [Xen-devel] VM disk I/O limit patch
On Tue, 21 Jun 2011 09:33:37 -0400 Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx> wrote:
> On Tue, Jun 21, 2011 at 04:29:35PM +0800, Andrew Xu wrote:
> > Hi all,
> >
> > I add a blkback QoS patch.
>
> What tree is this against?

This patch is based on the SUSE 11 SP1 (2.6.32) xen-blkback source
(i.e. the 2.6.18 "Xenlinux" based source trees?).

> There is a xen-blkback in 3.0-rc4, can you rebase
> it against that please.

Ok, I will rebase it.

> What is the patch solving?

With this patch you can set a different I/O speed for each VM disk.
For example, I limited vm17-disk1 to 4096 KB/s, vm17-disk2 to 1024 KB/s
and vm18-disk3 to 3096 KB/s by writing the following xenstore key-values:

/local/domain/17/device/vbd/768/tokens-rate = "4096"
/local/domain/17/device/vbd/2048/tokens-rate = "1024"
/local/domain/18/device/vbd/768/tokens-rate = "3096"

> Why can't it be done with dm-ioband?

Of course, I/O speed limiting can also be done with dm-ioband. But with
my patch there is no need to load dm-ioband at all; the limiting is done
closer to the disk and is more lightweight.

How does it work?
1) It records how many sectors (512 bytes each) have been submitted to
   the disk in the current one-second window.
2) If that count exceeds 2 * tokens-rate, the backend thread sleeps for
   the remainder of the second.
3) When the next second starts, go back to 1).
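In user-space C, the per-second accounting looks roughly like this. This
is a stand-alone sketch, not code from the patch; the names refill(),
account(), budget and window_start are made up, and tokens_rate stands
for the KB/s value written to the xenstore key:

/* Sketch of the accounting described above: one second's budget is
 * 2 * tokens_rate sectors of 512 bytes. */
#include <time.h>
#include <unistd.h>

static long budget;          /* sectors still allowed in this window */
static time_t window_start;  /* start of the current 1-second window */

static void refill(long tokens_rate)
{
	window_start = time(NULL);
	budget = 2 * tokens_rate;
}

/* Call this after submitting 'sectors_done' sectors to the disk. */
static void account(long tokens_rate, long sectors_done)
{
	time_t elapsed;

	budget -= sectors_done;
	elapsed = time(NULL) - window_start;

	if (budget <= 0 && elapsed < 1)
		sleep(1 - elapsed);     /* wait out the rest of the second */
	if (budget <= 0 || elapsed >= 1)
		refill(tokens_rate);    /* new second, new budget */
}

int main(void)
{
	long rate = 4096;               /* 4096 KB/s, as for vm17-disk1 */

	refill(rate);
	account(rate, 8192);            /* a full second's worth of sectors */
	account(rate, 128);             /* continues in the next window */
	return 0;
}

In the patch itself the same idea is expressed with jiffies, reqcount and
schedule_timeout() inside the blkback kernel thread.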
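Because blkback installs a xenbus watch on tokens-rate, the limit can
also be changed while the guest is running. A rough dom0-side example,
assuming the classic libxenstore calls xs_daemon_open()/xs_write()
(build with -lxenstore); the path is the vm17-disk1 key from above and
"2048" is just an arbitrary new value:

#include <stdio.h>
#include <string.h>
#include <xs.h>

int main(void)
{
	const char *path = "/local/domain/17/device/vbd/768/tokens-rate";
	const char *rate = "2048";      /* arbitrary new value: 2048 KB/s */
	struct xs_handle *xs = xs_daemon_open();

	if (!xs) {
		fprintf(stderr, "cannot connect to xenstored\n");
		return 1;
	}
	if (!xs_write(xs, XBT_NULL, path, rate, strlen(rate)))
		fprintf(stderr, "xs_write failed\n");

	xs_daemon_close(xs);
	return 0;
}

The backend's rate_changed() watch handler then re-reads the key and the
new budget takes effect on the next refill.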
> > You can config(dynamic/static) different I/O speed for different VM disk
> > by this patch.
> >
> > ----------------------------------------------------------------------------
> >
> > diff -urNp blkback/blkback.c blkback-qos/blkback.c
> > --- blkback/blkback.c	2011-06-22 07:54:19.000000000 +0800
> > +++ blkback-qos/blkback.c	2011-06-22 07:53:18.000000000 +0800
> > @@ -44,6 +44,11 @@
> >  #include <asm/hypervisor.h>
> >  #include "common.h"
> >
> > +#undef DPRINTK
> > +#define DPRINTK(fmt, args...)				\
> > +	printk("blkback/blkback (%s:%d) " fmt ".\n",	\
> > +		__FUNCTION__, __LINE__, ##args)
> > +
> >  /*
> >   * These are rather arbitrary.  They are fairly large because adjacent requests
> >   * pulled from a communication ring are quite likely to end up being part of
> > @@ -110,7 +115,8 @@ static inline unsigned long vaddr(pendin
> >  static int do_block_io_op(blkif_t *blkif);
> >  static int dispatch_rw_block_io(blkif_t *blkif,
> >  				blkif_request_t *req,
> > -				pending_req_t *pending_req);
> > +				pending_req_t *pending_req,
> > +				int *done_nr_sects);
> >  static void make_response(blkif_t *blkif, u64 id,
> >  			  unsigned short op, int st);
> >
> > @@ -206,10 +212,20 @@ static void print_stats(blkif_t *blkif)
> >  	blkif->st_pk_req = 0;
> >  }
> >
> > +static void refill_reqcount(blkif_t *blkif)
> > +{
> > +	blkif->reqtime = jiffies + msecs_to_jiffies(1000);
> > +	blkif->reqcount = blkif->reqrate;
> > +	if (blkif->reqcount < blkif->reqmin)
> > +		blkif->reqcount = blkif->reqmin;
> > +}
> > +
> >  int blkif_schedule(void *arg)
> >  {
> >  	blkif_t *blkif = arg;
> >  	struct vbd *vbd = &blkif->vbd;
> > +	int ret = 0;
> > +	struct timeval cur_time;
> >
> >  	blkif_get(blkif);
> >
> > @@ -232,12 +248,34 @@ int blkif_schedule(void *arg)
> >  		blkif->waiting_reqs = 0;
> >  		smp_mb(); /* clear flag *before* checking for work */
> >
> > -		if (do_block_io_op(blkif))
> > +		ret = do_block_io_op(blkif);
> > +		if (ret)
> >  			blkif->waiting_reqs = 1;
> >  		unplug_queue(blkif);
> >
> > +		if(blkif->reqmin){
> > +			if(2 == ret && (blkif->reqtime > jiffies)){
> > +				jiffies_to_timeval(jiffies, &cur_time);
> > +				if(log_stats && (cur_time.tv_sec % 10 ==1 ))
> > +					printk(KERN_DEBUG "%s: going to sleep %d millsecs(rate=%d)\n",
> > +						current->comm,
> > +						jiffies_to_msecs(blkif->reqtime - jiffies),
> > +						blkif->reqrate);
> > +
> > +				set_current_state(TASK_INTERRUPTIBLE);
> > +				schedule_timeout(blkif->reqtime - jiffies);
> > +
> > +				if(log_stats && (cur_time.tv_sec % 10 ==1 ))
> > +					printk(KERN_DEBUG "%s: sleep end(rate=%d)\n",
> > +						current->comm,blkif->reqrate);
> > +			}
> > +			if (time_after(jiffies, blkif->reqtime))
> > +				refill_reqcount(blkif);
> > +		}
> > +
> >  		if (log_stats && time_after(jiffies, blkif->st_print))
> >  			print_stats(blkif);
> > +
> >  	}
> >
> >  	if (log_stats)
> > @@ -306,7 +344,6 @@ irqreturn_t blkif_be_int(int irq, void *
> >  /******************************************************************
> >   * DOWNWARD CALLS -- These interface with the block-device layer proper.
> >   */
> > -
> >  static int do_block_io_op(blkif_t *blkif)
> >  {
> >  	blkif_back_rings_t *blk_rings = &blkif->blk_rings;
> > @@ -314,15 +351,27 @@ static int do_block_io_op(blkif_t *blkif
> >  	pending_req_t *pending_req;
> >  	RING_IDX rc, rp;
> >  	int more_to_do = 0, ret;
> > +	static int last_done_nr_sects = 0;
> >
> >  	rc = blk_rings->common.req_cons;
> >  	rp = blk_rings->common.sring->req_prod;
> >  	rmb(); /* Ensure we see queued requests up to 'rp'. */
> > +
> > +	if (blkif->reqmin && blkif->reqcount <= 0)
> > +		return (rc != rp) ? 2 : 0;
> >
> >  	while ((rc != rp) || (blkif->is_suspended_req)) {
> >
> >  		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
> >  			break;
> > +
> > +		if(blkif->reqmin){
> > +			blkif->reqcount -= last_done_nr_sects;
> > +			if (blkif->reqcount <= 0) {
> > +				more_to_do = 2;
> > +				break;
> > +			}
> > +		}
> >
> >  		if (kthread_should_stop()) {
> >  			more_to_do = 1;
> > @@ -367,14 +416,14 @@ handle_request:
> >  		switch (req.operation) {
> >  		case BLKIF_OP_READ:
> >  			blkif->st_rd_req++;
> > -			ret = dispatch_rw_block_io(blkif, &req, pending_req);
> > +			ret = dispatch_rw_block_io(blkif, &req, pending_req,&last_done_nr_sects);
> >  			break;
> >  		case BLKIF_OP_WRITE_BARRIER:
> >  			blkif->st_br_req++;
> >  			/* fall through */
> >  		case BLKIF_OP_WRITE:
> >  			blkif->st_wr_req++;
> > -			ret = dispatch_rw_block_io(blkif, &req, pending_req);
> > +			ret = dispatch_rw_block_io(blkif, &req, pending_req,&last_done_nr_sects);
> >  			break;
> >  		case BLKIF_OP_PACKET:
> >  			DPRINTK("error: block operation BLKIF_OP_PACKET not implemented\n");
> > @@ -412,9 +461,29 @@ handle_request:
> >  	return more_to_do;
> >  }
> >
> > +static char* operation2str(int operation)
> > +{
> > +	char* ret_str = NULL;
> > +	switch (operation) {
> > +	case BLKIF_OP_READ:
> > +		ret_str = "READ";
> > +		break;
> > +	case BLKIF_OP_WRITE:
> > +		ret_str = "WRITE";
> > +		break;
> > +	case BLKIF_OP_WRITE_BARRIER:
> > +		ret_str = "WRITE_BARRIER";
> > +		break;
> > +	default:
> > +		ret_str = "0";
> > +	}
> > +	return ret_str;
> > +}
> > +
> >  static int dispatch_rw_block_io(blkif_t *blkif,
> >  				blkif_request_t *req,
> > -				pending_req_t *pending_req)
> > +				pending_req_t *pending_req,
> > +				int *done_nr_sects)
> >  {
> >  	extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
> >  	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> > @@ -426,6 +495,9 @@ static int dispatch_rw_block_io(blkif_t
> >  	struct bio *bio = NULL;
> >  	int ret, i;
> >  	int operation;
> > +	struct timeval cur_time;
> > +
> > +	*done_nr_sects = 0;
> >
> >  	switch (req->operation) {
> >  	case BLKIF_OP_READ:
> > @@ -582,6 +654,12 @@ static int dispatch_rw_block_io(blkif_t
> >  	else if (operation == WRITE || operation == WRITE_BARRIER)
> >  		blkif->st_wr_sect += preq.nr_sects;
> >
> > +	*done_nr_sects = preq.nr_sects;
> > +	jiffies_to_timeval(jiffies, &cur_time);
> > +	if ((log_stats == 2) && (cur_time.tv_sec % 10 ==1 ))
> > +		printk(KERN_DEBUG " operation=%s sects=%d\n",
> > +			operation2str(req->operation),preq.nr_sects);
> > +
> >  	return 0;
> >
> >  fail_flush:
> > @@ -695,6 +773,8 @@ static int __init blkif_init(void)
> >
> >  	blkif_xenbus_init();
> >
> > +	DPRINTK("blkif_inited\n");
> > +
> >  	return 0;
> >
> >  out_of_memory:
> > diff -urNp blkback/cdrom.c blkback-qos/cdrom.c
> > --- blkback/cdrom.c	2010-05-20 18:07:00.000000000 +0800
> > +++ blkback-qos/cdrom.c	2011-06-22 07:34:50.000000000 +0800
> > @@ -35,9 +35,9 @@
> >  #include "common.h"
> >
> >  #undef DPRINTK
> > -#define DPRINTK(_f, _a...)			\
> > -	printk("(%s() file=%s, line=%d) " _f "\n",	\
> > -		__PRETTY_FUNCTION__, __FILE__ , __LINE__ , ##_a )
> > +#define DPRINTK(fmt, args...)				\
> > +	printk("blkback/cdrom (%s:%d) " fmt ".\n",	\
> > +		__FUNCTION__, __LINE__, ##args)
> >
> >
> >  #define MEDIA_PRESENT "media-present"
> > diff -urNp blkback/common.h blkback-qos/common.h
> > --- blkback/common.h	2010-05-20 18:07:00.000000000 +0800
> > +++ blkback-qos/common.h	2011-06-22 07:34:50.000000000 +0800
> > @@ -100,8 +100,17 @@ typedef struct blkif_st {
> >
> >  	grant_handle_t shmem_handle;
> >  	grant_ref_t    shmem_ref;
> > +
> > +	/* qos information */
> > +	unsigned long reqtime;
> > +	int reqcount;
> > +	int reqmin;
> > +	int reqrate;
> > +
> >  } blkif_t;
> >
> > +#define VBD_QOS_MIN_RATE_LIMIT	2*1024		/* 1MBs */
> > +
> >  struct backend_info
> >  {
> >  	struct xenbus_device *dev;
> > @@ -111,6 +120,8 @@ struct backend_info
> >  	unsigned major;
> >  	unsigned minor;
> >  	char *mode;
> > +	struct xenbus_watch rate_watch;
> > +	int have_rate_watch;
> >  };
> >
> >  blkif_t *blkif_alloc(domid_t domid);
> > diff -urNp blkback/vbd.c blkback-qos/vbd.c
> > --- blkback/vbd.c	2010-05-20 18:07:00.000000000 +0800
> > +++ blkback-qos/vbd.c	2011-06-22 07:34:50.000000000 +0800
> > @@ -35,6 +35,11 @@
> >  #define vbd_sz(_v)   ((_v)->bdev->bd_part ?				\
> >  	(_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk))
> >
> > +#undef DPRINTK
> > +#define DPRINTK(fmt, args...)				\
> > +	printk("blkback/vbd (%s:%d) " fmt ".\n",	\
> > +		__FUNCTION__, __LINE__, ##args)
> > +
> >  unsigned long long vbd_size(struct vbd *vbd)
> >  {
> >  	return vbd_sz(vbd);
> > @@ -87,7 +92,7 @@ int vbd_create(blkif_t *blkif, blkif_vde
> >  	if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
> >  		vbd->type |= VDISK_REMOVABLE;
> >
> > -	DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
> > +	DPRINTK("Successful creation of handle=%04x (dom=%u)",
> >  		handle, blkif->domid);
> >  	return 0;
> >  }
> > diff -urNp blkback/xenbus.c blkback-qos/xenbus.c
> > --- blkback/xenbus.c	2010-05-20 18:07:00.000000000 +0800
> > +++ blkback-qos/xenbus.c	2011-06-22 07:34:50.000000000 +0800
> > @@ -25,13 +25,14 @@
> >
> >  #undef DPRINTK
> >  #define DPRINTK(fmt, args...)				\
> > -	pr_debug("blkback/xenbus (%s:%d) " fmt ".\n",	\
> > +	printk("blkback/xenbus (%s:%d) " fmt ".\n",	\
> >  		__FUNCTION__, __LINE__, ##args)
> >
> >  static void connect(struct backend_info *);
> >  static int connect_ring(struct backend_info *);
> >  static void backend_changed(struct xenbus_watch *, const char **,
> >  			    unsigned int);
> > +static void unregister_rate_watch(struct backend_info *be);
> >
> >  static int blkback_name(blkif_t *blkif, char *buf)
> >  {
> > @@ -59,8 +60,10 @@ static void update_blkif_status(blkif_t
> >  	char name[TASK_COMM_LEN];
> >
> >  	/* Not ready to connect? */
> > -	if (!blkif->irq || !blkif->vbd.bdev)
> > +	if (!blkif->irq || !blkif->vbd.bdev){
> > +		DPRINTK("Not ready to connect");
> >  		return;
> > +	}
> >
> >  	/* Already connected? */
> >  	if (blkif->be->dev->state == XenbusStateConnected)
> > @@ -193,6 +196,8 @@ static int blkback_remove(struct xenbus_
> >  		be->cdrom_watch.node = NULL;
> >  	}
> >
> > +	unregister_rate_watch(be);
> > +
> >  	if (be->blkif) {
> >  		blkif_disconnect(be->blkif);
> >  		vbd_free(&be->blkif->vbd);
> > @@ -251,6 +256,10 @@ static int blkback_probe(struct xenbus_d
> >
> >  	err = xenbus_watch_path2(dev, dev->nodename, "physical-device",
> >  				 &be->backend_watch, backend_changed);
> > +
> > +	DPRINTK("blkback_probe called");
> > +	DPRINTK("dev->nodename=%s/physical-device",dev->nodename);
> > +
> >  	if (err)
> >  		goto fail;
> >
> > @@ -266,7 +275,6 @@ fail:
> >  	return err;
> >  }
> >
> > -
> >  /**
> >   * Callback received when the hotplug scripts have placed the physical-device
> >   * node. Read it and the mode node, and create a vbd. If the frontend is
> > @@ -283,8 +291,9 @@ static void backend_changed(struct xenbu
> >  	struct xenbus_device *dev = be->dev;
> >  	int cdrom = 0;
> >  	char *device_type;
> > +	char name[TASK_COMM_LEN];
> >
> > -	DPRINTK("");
> > +	DPRINTK("backend_changed called");
> >
> >  	err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
> >  			   &major, &minor);
> > @@ -322,6 +331,34 @@ static void backend_changed(struct xenbu
> >  		kfree(device_type);
> >  	}
> >
> > +	/* gather information about QoS policy for this device. */
> > +	err = blkback_name(be->blkif, name);
> > +	if (err) {
> > +		xenbus_dev_error(be->dev, err, "get blkback dev name");
> > +		return;
> > +	}
> > +
> > +	err = xenbus_gather(XBT_NIL, dev->otherend,
> > +			    "tokens-rate", "%d", &be->blkif->reqrate,
> > +			    NULL);
> > +	if(err){
> > +		DPRINTK("%s xenbus_gather(tokens-min,tokens-rate) error",name);
> > +	}else{
> > +		if(be->blkif->reqrate <= 0){
> > +			be->blkif->reqmin = 0 ;
> > +			DPRINTK("%s tokens-rate == 0,no limit",name);
> > +		}else{
> > +			DPRINTK("%s xenbus_gather(tokens-rate=%d)",name,be->blkif->reqrate);
> > +			be->blkif->reqrate *= 2;
> > +			be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT;
> > +			if(be->blkif->reqmin > be->blkif->reqrate){
> > +				be->blkif->reqrate = be->blkif->reqmin;
> > +				DPRINTK("%s reset default value(tokens-rate=%d)",name,be->blkif->reqrate);
> > +			}
> > +		}
> > +	}
> > +	be->blkif->reqtime = jiffies;
> > +
> >  	if (be->major == 0 && be->minor == 0) {
> >  		/* Front end dir is a number, which is used as the handle. */
> >
> > @@ -414,6 +451,49 @@ static void frontend_changed(struct xenb
> >
> >  /* ** Connection ** */
> >
> > +static void unregister_rate_watch(struct backend_info *be)
> > +{
> > +	if (be->have_rate_watch) {
> > +		unregister_xenbus_watch(&be->rate_watch);
> > +		kfree(be->rate_watch.node);
> > +	}
> > +	be->have_rate_watch = 0;
> > +}
> > +
> > +static void rate_changed(struct xenbus_watch *watch,
> > +			 const char **vec, unsigned int len)
> > +{
> > +
> > +	struct backend_info *be=container_of(watch,struct backend_info, rate_watch);
> > +	int err;
> > +	char name[TASK_COMM_LEN];
> > +
> > +	err = blkback_name(be->blkif, name);
> > +	if (err) {
> > +		xenbus_dev_error(be->dev, err, "get blkback dev name");
> > +		return;
> > +	}
> > +
> > +	err = xenbus_gather(XBT_NIL,be->dev->otherend,
> > +			    "tokens-rate", "%d",
> > +			    &be->blkif->reqrate,NULL);
> > +	if(err){
> > +		DPRINTK("%s xenbus_gather(tokens-rate) error",name);
> > +	}else{
> > +		if(be->blkif->reqrate <= 0){
> > +			be->blkif->reqmin = 0;
> > +			DPRINTK("%s tokens-rate == 0,no limit",name);
> > +		}else{
> > +			DPRINTK("%s xenbus_gather(tokens-rate=%d)",name,be->blkif->reqrate);
> > +			be->blkif->reqrate *= 2;
> > +			be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT;
> > +			if(be->blkif->reqmin > be->blkif->reqrate){
> > +				be->blkif->reqrate = be->blkif->reqmin;
> > +				DPRINTK("%s reset default value(tokens-rate=%d)",name,be->blkif->reqrate);
> > +			}
> > +		}
> > +	}
> > +}
> >
> >  /**
> >   * Write the physical details regarding the block device to the store, and
> > @@ -439,6 +519,14 @@ again:
> >  	if (err)
> >  		goto abort;
> >
> > +	/*add by andrew for centos pv*/
> > +	err = xenbus_printf(xbt, dev->nodename,"feature-flush-cache", "1");
> > +	if (err){
> > +		xenbus_dev_fatal(dev, err, "writing %s/feature-flush-cache",
> > +				 dev->nodename);
> > +		goto abort;
> > +	}
> > +
> >  	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
> >  			    vbd_size(&be->blkif->vbd));
> >  	if (err) {
> > @@ -469,11 +557,22 @@ again:
> >  	if (err)
> >  		xenbus_dev_fatal(dev, err, "ending transaction");
> >
> > +	DPRINTK("xenbus_switch_to XenbusStateConnected");
> > +
> >  	err = xenbus_switch_state(dev, XenbusStateConnected);
> >  	if (err)
> >  		xenbus_dev_fatal(dev, err, "switching to Connected state",
> >  				 dev->nodename);
> >
> > +	unregister_rate_watch(be);
> > +	err=xenbus_watch_path2(dev, dev->otherend, "tokens-rate",
> > +			       &be->rate_watch,rate_changed);
> > +	if (!err)
> > +		be->have_rate_watch = 1;
> > +	else
> > +		xenbus_dev_fatal(dev, err, "watching tokens-rate",
> > +				 dev->nodename);
> > +
> >  	return;
> >  abort:
> >  	xenbus_transaction_end(xbt, 1);
> >
> > _______________________________________________
> > Xen-devel mailing list
> > Xen-devel@xxxxxxxxxxxxxxxxxxx
> > http://lists.xensource.com/xen-devel
>
>
> _______________________________________________
> Xen-users mailing list
> Xen-users@xxxxxxxxxxxxxxxxxxx
> http://lists.xensource.com/xen-users

**************************************************
Xu An (Andrew Xu)
Department: 云快线 - Operations Support Center - R&D Center
Mobile: 18910391796
E-mail: xu.an@xxxxxxxxxx
Address: Building M5, No. 1 Jiuxianqiao East Road, Chaoyang District, Beijing
**************************************************

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel