[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [RFC PATCH 3/3] Implement 3-level event channel routines.



>>> On 31.12.12 at 19:22, Wei Liu <wei.liu2@xxxxxxxxxx> wrote:
> @@ -633,7 +712,50 @@ static void evtchn_set_pending(struct vcpu *v, int port)
>      {
>          vcpu_mark_events_pending(v);
>      }
> -    
> +}
> +
> +static void __evtchn_set_pending_l3(struct vcpu *v, int port)
> +{
> +    struct domain *d = v->domain;
> +
> +    int page_no = port / EVTCHNS_PER_PAGE;

Anything used as an array index should be unsigned int.

> +    int offset = port % EVTCHNS_PER_PAGE;
> +    int l1cb = BITS_PER_EVTCHN_WORD(d)*BITS_PER_EVTCHN_WORD(d);
> +    int l2cb = BITS_PER_EVTCHN_WORD(d);

These always being powers of 2 (afaict), using divides rather
than shifts is pretty inefficient (and I don't think the compiler has
reasonable chances to recognize this and do the replacement for
you).

> +
> +    if ( test_and_set_bit(offset, d->evtchn_pending[page_no]) )
> +        return;
> +
> +    if ( !test_bit(offset, d->evtchn_mask[page_no]) &&
> +         !test_and_set_bit(port / l2cb,
> +                           v->evtchn_pending_sel_l2) &&
> +         !test_and_set_bit(port / l1cb,
> +                           &vcpu_info(v, evtchn_pending_sel)) )

Indentation.

> @@ -969,6 +1138,290 @@ out:
>      return rc;
>  }
>  
> +static void __unmap_l2_sel(struct vcpu *v)
> +{
> +    unsigned long mfn;
> +
> +    if ( v->evtchn_pending_sel_l2 != 0 )
> +    {
> +        unmap_domain_page_global(v->evtchn_pending_sel_l2);
> +        mfn = virt_to_mfn(v->evtchn_pending_sel_l2);

virt_to_mfn() is not valid on the output of
map_domain_page{,_global}() (same further down). Yes, there is
at least one example of this in the existing code, but that's wrong
too, and is getting eliminated by the 16Tb patch series I'm in the
process of putting together.

> +        put_page_and_type(mfn_to_page(mfn));
> +
> +        v->evtchn_pending_sel_l2 = 0;
> +    }
> +}
> +
> +static int __map_l2_sel(struct vcpu *v, unsigned long gfn, unsigned long off)
> +{
> +    void *mapping;
> +    int rc = -EINVAL;
> +    struct page_info *page;
> +    struct domain *d = v->domain;
> +
> +    if ( off >= PAGE_SIZE )

Is e.g. off == PAGE_SIZE-1 really valid here? As you're mapping
guest memory here, _anything_ that is invalid must be rejected, or
else you're creating a security hole.

> +        return rc;
> +
> +    page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC);
> +    if ( !page )
> +        goto out;

Please be consistent here - always use return (as above) or
(less preferred by me personally) always use goto.

> +
> +    if ( !get_page_type(page, PGT_writable_page) )
> +    {
> +        put_page(page);
> +        goto out;
> +    }
> +
> +    mapping = __map_domain_page_global(page);
> +    if ( mapping == NULL )
> +    {
> +        put_page_and_type(page);
> +        rc = -ENOMEM;
> +        goto out;
> +    }
> +
> +    v->evtchn_pending_sel_l2 = (unsigned long *)(mapping + off);

Pointless cast?

> +static int __map_l3_arrays(struct domain *d, unsigned long *pending,
> +                           unsigned long *mask)
> +{
> +    int i;
> +    void *pending_mapping, *mask_mapping;
> +    struct page_info *pending_page, *mask_page;
> +    unsigned long pending_gfn, mask_gfn;
> +    int rc = -EINVAL;
> +
> +    for ( i = 0;
> +          i < MAX_L3_PAGES && pending[i] != 0 && mask[i] != 0;
> +          i++ )
> +    {
> +        pending_gfn = pending[i];
> +        mask_gfn = mask[i];
> +
> +        pending_page = get_page_from_gfn(d, pending_gfn, NULL, P2M_ALLOC);
> +        if ( !pending_page )
> +            goto err;
> +
> +        mask_page = get_page_from_gfn(d, mask_gfn, NULL, P2M_ALLOC);
> +        if ( !mask_page )
> +        {
> +            put_page(pending_page);
> +            goto err;
> +        }
> +
> +        if ( !get_page_type(pending_page, PGT_writable_page) )
> +        {
> +            put_page(pending_page);
> +            put_page(mask_page);
> +            goto err;
> +        }
> +
> +        if ( !get_page_type(mask_page, PGT_writable_page) )
> +        {
> +            put_page_and_type(pending_page);
> +            put_page(mask_page);
> +            goto err;
> +        }
> +
> +        pending_mapping = __map_domain_page_global(pending_page);
> +        if ( !pending_mapping )
> +        {
> +            put_page_and_type(pending_page);
> +            put_page_and_type(mask_page);
> +            rc = -ENOMEM;
> +            goto err;
> +        }
> +
> +        mask_mapping = __map_domain_page_global(mask_page);
> +        if ( !mask_mapping )
> +        {
> +            unmap_domain_page_global(pending_mapping);
> +            put_page_and_type(pending_page);
> +            put_page_and_type(mask_page);
> +            rc = -ENOMEM;
> +            goto err;

The error paths in this function constitute well over half of it - can
this not be consolidated in some way?

> +static void __evtchn_migrate_bitmap_l3(struct domain *d)
> +{
> +    struct vcpu *v;
> +
> +    /* Easy way to migrate, just move existing selector down one level
> +     * then copy the pending array and mask array */
> +    for_each_vcpu ( d, v )
> +    {
> +        memcpy(&v->evtchn_pending_sel_l2[0],
> +               &vcpu_info(v, evtchn_pending_sel),
> +               sizeof(vcpu_info(v, evtchn_pending_sel)));
> +
> +        memset(&vcpu_info(v, evtchn_pending_sel), 0,
> +               sizeof(vcpu_info(v, evtchn_pending_sel)));
> +
> +        set_bit(0, &vcpu_info(v, evtchn_pending_sel));
> +    }
> +
> +    memcpy(d->evtchn_pending[0], &shared_info(d, evtchn_pending),
> +           sizeof(shared_info(d, evtchn_pending)));
> +    memcpy(d->evtchn_mask[0], &shared_info(d, evtchn_mask),
> +           sizeof(shared_info(d, evtchn_mask)));
> +}
> +
> +static long __evtchn_register_3level(struct evtchn_register_3level *r)
> +{
> +    struct domain *d = current->domain;
> +    unsigned long mfns[r->nr_vcpus];
> +    unsigned long offsets[r->nr_vcpus];
> +    unsigned char was_up[r->nr_vcpus];

This is an absolute no-go in the hypervisor: Did you consider what
happens when a guest has a few hundred vCPU-s?

> +    int rc, i;
> +    struct vcpu *v;
> +
> +    if ( d->evtchn_level == 3 )
> +        return -EINVAL;
> +
> +    if ( r->nr_vcpus > d->max_vcpus )
> +        return -EINVAL;
> +
> +    for ( i = 0; i < MAX_L3_PAGES; i++ )
> +        if ( d->evtchn_pending[i] || d->evtchn_mask[i] )
> +            return -EINVAL;

This and the check immediately below should be redundant with
the level == 3 check earlier on (or if they aren't redundant, then
you need to fix the code elsewhere).

> +
> +    for_each_vcpu ( d, v )
> +        if ( v->evtchn_pending_sel_l2 )
> +            return -EINVAL;
> +
> +    if ( copy_from_user(mfns, r->l2sel_mfn,
> +                        sizeof(unsigned long)*r->nr_vcpus) )

copy_from_guest()!

> +        return -EFAULT;
> +
> +    if ( copy_from_user(offsets, r->l2sel_offset,
> +                        sizeof(unsigned long)*r->nr_vcpus) )
> +        return -EFAULT;
> +
> +    /* put cpu offline */
> +    for_each_vcpu ( d, v )
> +    {
> +        if ( v == current )
> +            was_up[v->vcpu_id] = 1;
> +        else
> +            was_up[v->vcpu_id] = !test_and_set_bit(_VPF_down,
> +                                                   &v->pause_flags);

This does not in any way halt that remote vCPU.

But anyway - is it really that useful to allow establishing the 3rd
level with all vCPU-s already up and event processing already
happening for the subject domain? I.e. wouldn't it suffice to
require the guest to do the setup _before_ setting up the first
event channel (which would imply that only vCPU 0 is active)?

> @@ -1251,7 +1712,6 @@ void evtchn_move_pirqs(struct vcpu *v)
>      spin_unlock(&d->event_lock);
>  }
>  
> -

???

>  static void domain_dump_evtchn_info(struct domain *d)
>  {
>      unsigned int port;
> @@ -1276,11 +1740,28 @@ static void domain_dump_evtchn_info(struct domain *d)
>          if ( chn->state == ECS_FREE )
>              continue;
>  
> -        printk("    %4u [%d/%d]: s=%d n=%d x=%d",
> -               port,
> -               !!test_bit(port, &shared_info(d, evtchn_pending)),
> -               !!test_bit(port, &shared_info(d, evtchn_mask)),
> -               chn->state, chn->notify_vcpu_id, chn->xen_consumer);
> +        printk("    %4u", port);
> +
> +        switch ( d->evtchn_level )
> +        {
> +        case 2:
> +            printk(" [%d/%d]:",
> +                   !!test_bit(port, &shared_info(d, evtchn_pending)),
> +                   !!test_bit(port, &shared_info(d, evtchn_mask)));
> +            break;
> +        case 3:
> +            printk(" [%d/%d]:",
> +                   !!test_bit(offset, d->evtchn_pending[page_no]),
> +                   !!test_bit(offset, d->evtchn_mask[page_no]));

Can't you, btw, set [0] of these two to the &shared_info() values,
and thus fold code here and perhaps elsewhere?

> +            break;
> +        default:
> +            printk(" %s: unknown event channel level %d for domain %d \n",
> +                   __FUNCTION__, d->evtchn_level, d->domain_id);
> +            goto out;
> +        }
> +
> +        printk(" s=%d n=%d x=%d", chn->state, chn->notify_vcpu_id,
> +               chn->xen_consumer);

So still all event channels in one go?

> --- a/xen/common/keyhandler.c
> +++ b/xen/common/keyhandler.c
> @@ -298,15 +298,15 @@ static void dump_domains(unsigned char key)
>      {
>          for_each_vcpu ( d, v )
>          {
> +            unsigned int bits = BITS_PER_EVTCHN_WORD(d);
> +            for (int i = 2; i < d->evtchn_level; i++)

Did you check that gcc 4.1.x deals with this non-C89 code fine,
with no warning?

> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -693,7 +693,7 @@ static long do_poll(struct sched_poll *sched_poll)
>              goto out;
>  
>          rc = 0;
> -        if ( test_bit(port, &shared_info(d, evtchn_pending)) )
> +        if ( evtchn_is_pending(d, port) )

It would likely make the patch better readable if mechanical
adjustments like this got split out into a prerequisite patch, even
if used only in very few places.

> --- a/xen/include/public/event_channel.h
> +++ b/xen/include/public/event_channel.h
> @@ -71,6 +71,7 @@
>  #define EVTCHNOP_bind_vcpu        8
>  #define EVTCHNOP_unmask           9
>  #define EVTCHNOP_reset           10
> +#define EVTCHNOP_register_nlevel 11
>  /* ` } */
>  
>  typedef uint32_t evtchn_port_t;
> @@ -258,6 +259,29 @@ struct evtchn_reset {
>  typedef struct evtchn_reset evtchn_reset_t;
>  
>  /*
> + * EVTCHNOP_register_nlevel: Register N level event channels.
> + * NOTES:
> + *   1. currently only 3-level is supported.
> + *   2. should fall back to basic 2-level if this call fails.
> + */
> +#define MAX_L3_PAGES 8

Without explanation, no-one will easily understand whether this
number is made up or the result of some calculation. Also, with it
having "MAX" in its name, I'd conclude there can be fewer than 8
pages passed by the guest, yet the interface lacks any indication
of how many pages there are being passed.

> +struct evtchn_register_3level {
> +    unsigned long evtchn_pending[MAX_L3_PAGES];
> +    unsigned long evtchn_mask[MAX_L3_PAGES];
> +    unsigned long *l2sel_mfn;
> +    unsigned long *l2sel_offset;
> +    uint32_t nr_vcpus;
> +};
> +
> +struct evtchn_register_nlevel {
> +    uint32_t level;
> +    union {
> +        struct evtchn_register_3level l3;
> +    } u;
> +};
> +typedef struct evtchn_register_nlevel evtchn_register_nlevel_t;
> +
> +/*
>   * ` enum neg_errnoval
>   * ` HYPERVISOR_event_channel_op_compat(struct evtchn_op *op)
>   * `
> --- a/xen/include/public/xen.h
> +++ b/xen/include/public/xen.h
> @@ -554,9 +554,24 @@ DEFINE_XEN_GUEST_HANDLE(multicall_entry_t);
>  
>  /*
>   * Event channel endpoints per domain:
> + * 2-level:
>   *  1024 if a long is 32 bits; 4096 if a long is 64 bits.
> + * 3-level:
> + *  32k if a long is 32 bits; 256k if a long is 64 bits;
>   */
> -#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 
> 64)
> +#define NR_EVENT_CHANNELS_L2 (sizeof(unsigned long) * sizeof(unsigned long) 
> * 64)
> +#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * 64)
> +#define NR_EVENT_CHANNELS(x) ({ unsigned int __v = 0;   \

This is not a valid thing to do in a public header: You can't replace
an object-like macro such as NR_EVENT_CHANNELS with a function-like
alternative; you need to retain the old definition for consumers
unaware of the new extension.

> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -64,7 +64,8 @@ extern struct domain *dom0;
>              __v;})
>  
>  #define EVTCHNS_PER_BUCKET 512
> -#define NR_EVTCHN_BUCKETS  (NR_EVENT_CHANNELS / EVTCHNS_PER_BUCKET)
> +#define NR_EVTCHN_BUCKETS  (NR_EVENT_CHANNELS_L3 / EVTCHNS_PER_BUCKET)
> +#define EVTCHNS_PER_PAGE   (PAGE_SIZE * 8)

So is this 8 the same as the one above? If so, why don't you use
the #define? If not, where does this 8 come from?

> @@ -104,7 +105,7 @@ void evtchn_destroy_final(struct domain *d); /* from 
> complete_domain_destroy */
>  
>  struct waitqueue_vcpu;
>  
> -struct vcpu 
> +struct vcpu

???

> @@ -275,6 +279,10 @@ struct domain
>      struct evtchn  **evtchn;
>      spinlock_t       event_lock;
>      unsigned int     evtchn_level;
> +#define L3_PAGES (NR_EVENT_CHANNELS_L3/BITS_PER_LONG*sizeof(unsigned 
> long)/PAGE_SIZE)

Without the use of BITS_TO_LONGS() and PFN_UP() it is not clear
that you don't cut off any non-zero bits here. Hence you ought to
either use those macros, or put a validating BUILD_BUG_ON()
somewhere.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.