[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [RFC PATCH 3/3] Implement 3-level event channel routines.



>>> On 31.12.12 at 19:22, Wei Liu <wei.liu2@xxxxxxxxxx> wrote:
> @@ -633,7 +712,50 @@ static void evtchn_set_pending(struct vcpu *v, int port)
>      {
>          vcpu_mark_events_pending(v);
>      }
> -    
> +}
> +
> +static void __evtchn_set_pending_l3(struct vcpu *v, int port)
> +{
> +    struct domain *d = v->domain;
> +
> +    int page_no = port / EVTCHNS_PER_PAGE;

Anything used as an array index should be unsigned int.

> +    int offset = port % EVTCHNS_PER_PAGE;
> +    int l1cb = BITS_PER_EVTCHN_WORD(d)*BITS_PER_EVTCHN_WORD(d);
> +    int l2cb = BITS_PER_EVTCHN_WORD(d);

These always being powers of 2 (afaict), using divides rather
than shifts is pretty inefficient (and I don't think the compiler has
reasonable chances to recognize this and do the replacement for
you).

> +
> +    if ( test_and_set_bit(offset, d->evtchn_pending[page_no]) )
> +        return;
> +
> +    if ( !test_bit(offset, d->evtchn_mask[page_no]) &&
> +         !test_and_set_bit(port / l2cb,
> +                           v->evtchn_pending_sel_l2) &&
> +         !test_and_set_bit(port / l1cb,
> +                           &vcpu_info(v, evtchn_pending_sel)) )

Indentation.

> @@ -969,6 +1138,290 @@ out:
>      return rc;
>  }
>  
> +static void __unmap_l2_sel(struct vcpu *v)
> +{
> +    unsigned long mfn;
> +
> +    if ( v->evtchn_pending_sel_l2 != 0 )
> +    {
> +        unmap_domain_page_global(v->evtchn_pending_sel_l2);
> +        mfn = virt_to_mfn(v->evtchn_pending_sel_l2);

virt_to_mfn() is not valid on the output of
map_domain_page{,_global}() (same further down). Yes, there is
at least one example of this in the existing code, but that's wrong
too, and is getting eliminated by the 16Tb patch series I'm in the
process of putting together.

> +        put_page_and_type(mfn_to_page(mfn));
> +
> +        v->evtchn_pending_sel_l2 = 0;
> +    }
> +}
> +
> +static int __map_l2_sel(struct vcpu *v, unsigned long gfn, unsigned long off)
> +{
> +    void *mapping;
> +    int rc = -EINVAL;
> +    struct page_info *page;
> +    struct domain *d = v->domain;
> +
> +    if ( off >= PAGE_SIZE )

Is e.g. off == PAGE_SIZE-1 really valid here? As you're mapping
guest memory here, _anything_ that is invalid must be rejected, or
else you're creating a security hole.

> +        return rc;
> +
> +    page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC);
> +    if ( !page )
> +        goto out;

Please be consistent here - always use return (as above) or
(less preferred by me personally) always use goto.

> +
> +    if ( !get_page_type(page, PGT_writable_page) )
> +    {
> +        put_page(page);
> +        goto out;
> +    }
> +
> +    mapping = __map_domain_page_global(page);
> +    if ( mapping == NULL )
> +    {
> +        put_page_and_type(page);
> +        rc = -ENOMEM;
> +        goto out;
> +    }
> +
> +    v->evtchn_pending_sel_l2 = (unsigned long *)(mapping + off);

Pointless cast?

> +static int __map_l3_arrays(struct domain *d, unsigned long *pending,
> +                           unsigned long *mask)
> +{
> +    int i;
> +    void *pending_mapping, *mask_mapping;
> +    struct page_info *pending_page, *mask_page;
> +    unsigned long pending_gfn, mask_gfn;
> +    int rc = -EINVAL;
> +
> +    for ( i = 0;
> +          i < MAX_L3_PAGES && pending[i] != 0 && mask[i] != 0;
> +          i++ )
> +    {
> +        pending_gfn = pending[i];
> +        mask_gfn = mask[i];
> +
> +        pending_page = get_page_from_gfn(d, pending_gfn, NULL, P2M_ALLOC);
> +        if ( !pending_page )
> +            goto err;
> +
> +        mask_page = get_page_from_gfn(d, mask_gfn, NULL, P2M_ALLOC);
> +        if ( !mask_page )
> +        {
> +            put_page(pending_page);
> +            goto err;
> +        }
> +
> +        if ( !get_page_type(pending_page, PGT_writable_page) )
> +        {
> +            put_page(pending_page);
> +            put_page(mask_page);
> +            goto err;
> +        }
> +
> +        if ( !get_page_type(mask_page, PGT_writable_page) )
> +        {
> +            put_page_and_type(pending_page);
> +            put_page(mask_page);
> +            goto err;
> +        }
> +
> +        pending_mapping = __map_domain_page_global(pending_page);
> +        if ( !pending_mapping )
> +        {
> +            put_page_and_type(pending_page);
> +            put_page_and_type(mask_page);
> +            rc = -ENOMEM;
> +            goto err;
> +        }
> +
> +        mask_mapping = __map_domain_page_global(mask_page);
> +        if ( !mask_mapping )
> +        {
> +            unmap_domain_page_global(pending_mapping);
> +            put_page_and_type(pending_page);
> +            put_page_and_type(mask_page);
> +            rc = -ENOMEM;
> +            goto err;

The error paths in this function constitute well over half of it - can
this not be consolidated in some way?

> +static void __evtchn_migrate_bitmap_l3(struct domain *d)
> +{
> +    struct vcpu *v;
> +
> +    /* Easy way to migrate, just move existing selector down one level
> +     * then copy the pending array and mask array */
> +    for_each_vcpu ( d, v )
> +    {
> +        memcpy(&v->evtchn_pending_sel_l2[0],
> +               &vcpu_info(v, evtchn_pending_sel),
> +               sizeof(vcpu_info(v, evtchn_pending_sel)));
> +
> +        memset(&vcpu_info(v, evtchn_pending_sel), 0,
> +               sizeof(vcpu_info(v, evtchn_pending_sel)));
> +
> +        set_bit(0, &vcpu_info(v, evtchn_pending_sel));
> +    }
> +
> +    memcpy(d->evtchn_pending[0], &shared_info(d, evtchn_pending),
> +           sizeof(shared_info(d, evtchn_pending)));
> +    memcpy(d->evtchn_mask[0], &shared_info(d, evtchn_mask),
> +           sizeof(shared_info(d, evtchn_mask)));
> +}
> +
> +static long __evtchn_register_3level(struct evtchn_register_3level *r)
> +{
> +    struct domain *d = current->domain;
> +    unsigned long mfns[r->nr_vcpus];
> +    unsigned long offsets[r->nr_vcpus];
> +    unsigned char was_up[r->nr_vcpus];

This is an absolute no-go in the hypervisor: Did you consider what
happens when a guest has a few hundred vCPU-s?

> +    int rc, i;
> +    struct vcpu *v;
> +
> +    if ( d->evtchn_level == 3 )
> +        return -EINVAL;
> +
> +    if ( r->nr_vcpus > d->max_vcpus )
> +        return -EINVAL;
> +
> +    for ( i = 0; i < MAX_L3_PAGES; i++ )
> +        if ( d->evtchn_pending[i] || d->evtchn_mask[i] )
> +            return -EINVAL;

This and the check immediately below should be redundant with
the level == 3 check earlier on (or if they aren't redundant, then
you need to fix the code elsewhere).

> +
> +    for_each_vcpu ( d, v )
> +        if ( v->evtchn_pending_sel_l2 )
> +            return -EINVAL;
> +
> +    if ( copy_from_user(mfns, r->l2sel_mfn,
> +                        sizeof(unsigned long)*r->nr_vcpus) )

copy_from_guest()!

> +        return -EFAULT;
> +
> +    if ( copy_from_user(offsets, r->l2sel_offset,
> +                        sizeof(unsigned long)*r->nr_vcpus) )
> +        return -EFAULT;
> +
> +    /* put cpu offline */
> +    for_each_vcpu ( d, v )
> +    {
> +        if ( v == current )
> +            was_up[v->vcpu_id] = 1;
> +        else
> +            was_up[v->vcpu_id] = !test_and_set_bit(_VPF_down,
> +                                                   &v->pause_flags);

This does not in any way halt that remote vCPU.

But anyway - is it really that useful to allow establishing the 3rd
level with all vCPU-s already up and event processing already
happening for the subject domain? I.e. wouldn't it suffice to
require the guest to do the setup _before_ setting up the first
event channel (which would imply that only vCPU 0 is active)?

> @@ -1251,7 +1712,6 @@ void evtchn_move_pirqs(struct vcpu *v)
>      spin_unlock(&d->event_lock);
>  }
>  
> -

???

>  static void domain_dump_evtchn_info(struct domain *d)
>  {
>      unsigned int port;
> @@ -1276,11 +1740,28 @@ static void domain_dump_evtchn_info(struct domain *d)
>          if ( chn->state == ECS_FREE )
>              continue;
>  
> -        printk("    %4u [%d/%d]: s=%d n=%d x=%d",
> -               port,
> -               !!test_bit(port, &shared_info(d, evtchn_pending)),
> -               !!test_bit(port, &shared_info(d, evtchn_mask)),
> -               chn->state, chn->notify_vcpu_id, chn->xen_consumer);
> +        printk("    %4u", port);
> +
> +        switch ( d->evtchn_level )
> +        {
> +        case 2:
> +            printk(" [%d/%d]:",
> +                   !!test_bit(port, &shared_info(d, evtchn_pending)),
> +                   !!test_bit(port, &shared_info(d, evtchn_mask)));
> +            break;
> +        case 3:
> +            printk(" [%d/%d]:",
> +                   !!test_bit(offset, d->evtchn_pending[page_no]),
> +                   !!test_bit(offset, d->evtchn_mask[page_no]));

Can't you, btw, set [0] of these two to the &shared_info() values,
and thus fold code here and perhaps elsewhere?

> +            break;
> +        default:
> +            printk(" %s: unknown event channel level %d for domain %d \n",
> +                   __FUNCTION__, d->evtchn_level, d->domain_id);
> +            goto out;
> +        }
> +
> +        printk(" s=%d n=%d x=%d", chn->state, chn->notify_vcpu_id,
> +               chn->xen_consumer);

So still all event channels in one go?

> --- a/xen/common/keyhandler.c
> +++ b/xen/common/keyhandler.c
> @@ -298,15 +298,15 @@ static void dump_domains(unsigned char key)
>      {
>          for_each_vcpu ( d, v )
>          {
> +            unsigned int bits = BITS_PER_EVTCHN_WORD(d);
> +            for (int i = 2; i < d->evtchn_level; i++)

Did you check that gcc 4.1.x deals with this non-C89 code fine,
with no warning?

> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -693,7 +693,7 @@ static long do_poll(struct sched_poll *sched_poll)
>              goto out;
>  
>          rc = 0;
> -        if ( test_bit(port, &shared_info(d, evtchn_pending)) )
> +        if ( evtchn_is_pending(d, port) )

It would likely make the patch better readable if mechanical
adjustments like this got split out into a prerequisite patch, even
if used only in very few places.

> --- a/xen/include/public/event_channel.h
> +++ b/xen/include/public/event_channel.h
> @@ -71,6 +71,7 @@
>  #define EVTCHNOP_bind_vcpu        8
>  #define EVTCHNOP_unmask           9
>  #define EVTCHNOP_reset           10
> +#define EVTCHNOP_register_nlevel 11
>  /* ` } */
>  
>  typedef uint32_t evtchn_port_t;
> @@ -258,6 +259,29 @@ struct evtchn_reset {
>  typedef struct evtchn_reset evtchn_reset_t;
>  
>  /*
> + * EVTCHNOP_register_nlevel: Register N level event channels.
> + * NOTES:
> + *   1. currently only 3-level is supported.
> + *   2. should fall back to basic 2-level if this call fails.
> + */
> +#define MAX_L3_PAGES 8

Without explanation, no-one will easily understand whether this
number is made up or the result of some calculation. Also, with it
having "MAX" in its name, I'd conclude there can be fewer than 8
pages passed by the guest, yet the interface lacks any indication
of how many pages there are being passed.

> +struct evtchn_register_3level {
> +    unsigned long evtchn_pending[MAX_L3_PAGES];
> +    unsigned long evtchn_mask[MAX_L3_PAGES];
> +    unsigned long *l2sel_mfn;
> +    unsigned long *l2sel_offset;
> +    uint32_t nr_vcpus;
> +};
> +
> +struct evtchn_register_nlevel {
> +    uint32_t level;
> +    union {
> +        struct evtchn_register_3level l3;
> +    } u;
> +};
> +typedef struct evtchn_register_nlevel evtchn_register_nlevel_t;
> +
> +/*
>   * ` enum neg_errnoval
>   * ` HYPERVISOR_event_channel_op_compat(struct evtchn_op *op)
>   * `
> --- a/xen/include/public/xen.h
> +++ b/xen/include/public/xen.h
> @@ -554,9 +554,24 @@ DEFINE_XEN_GUEST_HANDLE(multicall_entry_t);
>  
>  /*
>   * Event channel endpoints per domain:
> + * 2-level:
>   *  1024 if a long is 32 bits; 4096 if a long is 64 bits.
> + * 3-level:
> + *  32k if a long is 32 bits; 256k if a long is 64 bits;
>   */
> -#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 
> 64)
> +#define NR_EVENT_CHANNELS_L2 (sizeof(unsigned long) * sizeof(unsigned long) 
> * 64)
> +#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * 64)
> +#define NR_EVENT_CHANNELS(x) ({ unsigned int __v = 0;   \

This is not a valid thing to do in a public header: You can't replace
an object-like macro such as NR_EVENT_CHANNELS with a function-like
alternative; you need to retain the old definition for consumers
unaware of the new extension.

> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -64,7 +64,8 @@ extern struct domain *dom0;
>              __v;})
>  
>  #define EVTCHNS_PER_BUCKET 512
> -#define NR_EVTCHN_BUCKETS  (NR_EVENT_CHANNELS / EVTCHNS_PER_BUCKET)
> +#define NR_EVTCHN_BUCKETS  (NR_EVENT_CHANNELS_L3 / EVTCHNS_PER_BUCKET)
> +#define EVTCHNS_PER_PAGE   (PAGE_SIZE * 8)

So is this 8 the same as the one above? If so, why don't you use
the #define? If not, where does this 8 come from?

> @@ -104,7 +105,7 @@ void evtchn_destroy_final(struct domain *d); /* from 
> complete_domain_destroy */
>  
>  struct waitqueue_vcpu;
>  
> -struct vcpu 
> +struct vcpu

???

> @@ -275,6 +279,10 @@ struct domain
>      struct evtchn  **evtchn;
>      spinlock_t       event_lock;
>      unsigned int     evtchn_level;
> +#define L3_PAGES (NR_EVENT_CHANNELS_L3/BITS_PER_LONG*sizeof(unsigned 
> long)/PAGE_SIZE)

Without the use of BITS_TO_LONGS() and PFN_UP() it is not clear
that you don't cut off any non-zero bits here. Hence you ought to
either use those macros, or put a validating BUILD_BUG_ON()
somewhere.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.