
Re: [Xen-devel] [PATCH v6 16/18] vmx: Add some scheduler hooks for VT-d posted interrupts



>>> On 25.08.15 at 03:57, <feng.wu@xxxxxxxxx> wrote:
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -1573,6 +1573,22 @@ static void __context_switch(void)
>      per_cpu(curr_vcpu, cpu) = n;
>  }
>  
> +static inline void pi_ctxt_switch_from(struct vcpu *prev)
> +{
> +    /*
> +     * When switching from non-idle to idle, we only do a lazy context switch.
> +     * However, in order for posted interrupt (if available and enabled) to
> +     * work properly, we at least need to update the descriptors.
> +     */
> +    if ( prev->arch.pi_ctxt_switch_from && !is_idle_vcpu(prev) )
> +        prev->arch.pi_ctxt_switch_from(prev);
> +}
> +
> +static inline void pi_ctxt_switch_to(struct vcpu *next)
> +{
> +    if ( next->arch.pi_ctxt_switch_to && !is_idle_vcpu(next) )
> +        next->arch.pi_ctxt_switch_to(next);
> +}
>  
>  void context_switch(struct vcpu *prev, struct vcpu *next)
>  {
> @@ -1605,9 +1621,12 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
>  
>      set_current(next);
>  
> +    pi_ctxt_switch_from(prev);
> +
>      if ( (per_cpu(curr_vcpu, cpu) == next) ||
>           (is_idle_domain(nextd) && cpu_online(cpu)) )
>      {
> +        pi_ctxt_switch_to(next);
>          local_irq_enable();

This placement, if really intended that way, needs explanation (in a
comment) and perhaps even renaming of the involved symbols, as
looking at it from a general perspective it seems wrong (with
pi_ctxt_switch_to() excluding idle vCPU-s it effectively means you
want this only when switching back to what got switched out lazily
before, i.e. this would not be something to take place on an arbitrary
context switch). As to possible alternative names - maybe make the
hooks ctxt_switch_prepare() and ctxt_switch_cancel()?
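
Roughly, as an (untested) sketch keeping the hook fields the patch
already adds, but with the prepare/cancel intent made explicit:

static inline void ctxt_switch_prepare(struct vcpu *prev)
{
    /*
     * Let the vCPU being switched out adjust its PI descriptor, even if
     * the switch ends up being a lazy one.
     */
    if ( !is_idle_vcpu(prev) && prev->arch.pi_ctxt_switch_from )
        prev->arch.pi_ctxt_switch_from(prev);
}

static inline void ctxt_switch_cancel(struct vcpu *next)
{
    /*
     * Undo that preparation when we merely switch back to the vCPU
     * which got switched out lazily before.
     */
    if ( !is_idle_vcpu(next) && next->arch.pi_ctxt_switch_to )
        next->arch.pi_ctxt_switch_to(next);
}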

> @@ -117,10 +119,20 @@ static int vmx_vcpu_initialise(struct vcpu *v)
>      INIT_LIST_HEAD(&v->arch.hvm_vmx.pi_blocked_vcpu_list);
>      INIT_LIST_HEAD(&v->arch.hvm_vmx.pi_vcpu_on_set_list);
>  
> +    v->arch.hvm_vmx.pi_block_cpu = -1;
> +
> +    spin_lock_init(&v->arch.hvm_vmx.pi_lock);
> +
>      v->arch.schedule_tail    = vmx_do_resume;
>      v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
>      v->arch.ctxt_switch_to   = vmx_ctxt_switch_to;
>  
> +    if ( iommu_intpost && is_hvm_vcpu(v) )
> +    {
> +        v->arch.pi_ctxt_switch_from = vmx_pre_ctx_switch_pi;
> +        v->arch.pi_ctxt_switch_to = vmx_post_ctx_switch_pi;
> +    }

Why conditional upon is_hvm_vcpu()?

> @@ -718,6 +730,140 @@ static void vmx_fpu_leave(struct vcpu *v)
>      }
>  }
>  
> +void arch_vcpu_wake_prepare(struct vcpu *v)

A non-static function with this name should not live in vmx.c.

> +{
> +    unsigned long gflags;

Just "flags" please.

> +    if ( !iommu_intpost || !is_hvm_vcpu(v) || !has_arch_pdevs(v->domain) )

DYM !has_hvm_container_vcpu()?

> +        return;
> +
> +    spin_lock_irqsave(&v->arch.hvm_vmx.pi_lock, gflags);
> +
> +    if ( likely(vcpu_runnable(v)) ||
> +         !test_bit(_VPF_blocked, &v->pause_flags) )

_VPF_blocked set implies !vcpu_runnable(), i.e. afaict just the
latter check would suffice if this is really what you mean.
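
If the latter really is what you mean, afaict the whole condition could
simply become

    if ( !test_bit(_VPF_blocked, &v->pause_flags) )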

> +    {
> +        struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
> +        unsigned long flags;

You don't need this - you already know IRQs are off for the lock
acquire below.

> +        /*
> +         * We don't need to send notification event to a non-running
> +         * vcpu, the interrupt information will be delivered to it before
> +         * VM-ENTRY when the vcpu is scheduled to run next time.
> +         */
> +        pi_set_sn(pi_desc);
> +
> +        /*
> +         * Set 'NV' field back to posted_intr_vector, so the
> +         * Posted-Interrupts can be delivered to the vCPU by
> +         * VT-d HW after it is scheduled to run.
> +         */
> +        write_atomic((uint8_t*)&pi_desc->nv, posted_intr_vector);

Bogus cast.
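
Assuming ->nv is already declared as a single byte field, the plain form
ought to suffice:

        write_atomic(&pi_desc->nv, posted_intr_vector);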

> +
> +        /*
> +         * Delete the vCPU from the related block list
> +         * if we are resuming from blocked state.
> +         */
> +        if ( v->arch.hvm_vmx.pi_block_cpu != -1 )
> +        {
> +            spin_lock_irqsave(&per_cpu(pi_blocked_vcpu_lock,
> +                              v->arch.hvm_vmx.pi_block_cpu), flags);
> +            list_del_init(&v->arch.hvm_vmx.pi_blocked_vcpu_list);

Shouldn't you set .pi_block_cpu back to -1 along with removing
the vCPU from the list? Which then ought to allow using just
list_del() here?
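
I.e., also folding in the "flags" remark above, something like (untested):

        if ( v->arch.hvm_vmx.pi_block_cpu != -1 )
        {
            spin_lock(&per_cpu(pi_blocked_vcpu_lock,
                               v->arch.hvm_vmx.pi_block_cpu));
            list_del(&v->arch.hvm_vmx.pi_blocked_vcpu_list);
            spin_unlock(&per_cpu(pi_blocked_vcpu_lock,
                                 v->arch.hvm_vmx.pi_block_cpu));
            v->arch.hvm_vmx.pi_block_cpu = -1;
        }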

> +static void vmx_pre_ctx_switch_pi(struct vcpu *v)
> +{
> +    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
> +    struct pi_desc old, new;

Please limit the scope of these two variables.

> +    unsigned long flags, gflags;

See above.

> +    if ( !has_arch_pdevs(v->domain) )
> +        return;
> +
> +    spin_lock_irqsave(&v->arch.hvm_vmx.pi_lock, gflags);
> +
> +    if ( vcpu_runnable(v) || !test_bit(_VPF_blocked, &v->pause_flags) )

See above.

> +    {
> +        /*
> +         * The vCPU has been preempted or went to sleep. We don't need to send
> +         * notification event to a non-running vcpu, the interrupt information
> +         * will be delivered to it before VM-ENTRY when the vcpu is scheduled
> +         * to run next time.
> +         */
> +        pi_set_sn(pi_desc);
> +
> +    }
> +    else if ( test_bit(_VPF_blocked, &v->pause_flags) )

The condition here is redundant with the if() one above.

> +    {
> +        /*
> +         * The vCPU is blocking, we need to add it to one of the per pCPU lists.
> +         * We save v->processor to v->arch.hvm_vmx.pi_block_cpu and use it for
> +         * the per-CPU list, we also save it to posted-interrupt descriptor and
> +         * make it as the destination of the wake-up notification event.
> +         */
> +        v->arch.hvm_vmx.pi_block_cpu = v->processor;
> +        spin_lock_irqsave(&per_cpu(pi_blocked_vcpu_lock,
> +                          v->arch.hvm_vmx.pi_block_cpu), flags);
> +        list_add_tail(&v->arch.hvm_vmx.pi_blocked_vcpu_list,
> +                      &per_cpu(pi_blocked_vcpu, v->arch.hvm_vmx.pi_block_cpu));
> +        spin_unlock_irqrestore(&per_cpu(pi_blocked_vcpu_lock,
> +                           v->arch.hvm_vmx.pi_block_cpu), flags);
> +
> +        do {
> +            old.control = new.control = pi_desc->control;
> +
> +            /* Should not block the vCPU if an interrupt was posted for it. */
> +            if ( pi_test_on(&old) )
> +            {
> +                spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_lock, gflags);
> +                vcpu_unblock(v);

Calling vcpu_unblock() in the middle of context_switch()? Why? And
is this safe? And if really needed to cover something not dealt with
elsewhere, wouldn't this need to happen _after_ having switched
the notification mechanism below?

> +                return;
> +            }
> +
> +            /*
> +             * Change the 'NDST' field to v->arch.hvm_vmx.pi_block_cpu,
> +             * so when external interrupts from assigned deivces happen,
> +             * wakeup notifiction event will go to
> +             * v->arch.hvm_vmx.pi_block_cpu, then in pi_wakeup_interrupt()
> +             * we can find the vCPU in the right list to wake up.
> +             */
> +            if ( x2apic_enabled )
> +                new.ndst = cpu_physical_id(v->arch.hvm_vmx.pi_block_cpu);
> +            else
> +                new.ndst = MASK_INSR(cpu_physical_id(
> +                                 v->arch.hvm_vmx.pi_block_cpu),
> +                                 PI_xAPIC_NDST_MASK);

Indentation is screwed up here. Perhaps it would help if you latched
cpu_physical_id() into a local variable and used it on both branches?
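
E.g. (sketch; the declaration would want to go at the top of the do {}
block):

            unsigned int dest = cpu_physical_id(v->arch.hvm_vmx.pi_block_cpu);

            new.ndst = x2apic_enabled ? dest
                                      : MASK_INSR(dest, PI_xAPIC_NDST_MASK);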

> +            pi_clear_sn(&new);
> +            new.nv = pi_wakeup_vector;
> +        } while ( cmpxchg(&pi_desc->control, old.control, new.control)
> +                  != old.control );

Operators belong on the earlier of the split up lines.
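
I.e.

        } while ( cmpxchg(&pi_desc->control, old.control, new.control) !=
                  old.control );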

> +static void vmx_post_ctx_switch_pi(struct vcpu *v)
> +{
> +    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
> +
> +    if ( !has_arch_pdevs(v->domain) )
> +        return;
> +
> +    if ( x2apic_enabled )
> +        write_atomic(&pi_desc->ndst, cpu_physical_id(v->processor));
> +    else
> +        write_atomic(&pi_desc->ndst,
> +                     MASK_INSR(cpu_physical_id(v->processor),
> +                     PI_xAPIC_NDST_MASK));
> +
> +    pi_clear_sn(pi_desc);
> +}

So you alter where notifications go, but not via which vector? How
is the vCPU going to get removed from the blocked list then?

> @@ -756,6 +902,7 @@ static void vmx_ctxt_switch_to(struct vcpu *v)
>  
>      vmx_restore_guest_msrs(v);
>      vmx_restore_dr(v);
> +    vmx_post_ctx_switch_pi(v);
>  }

Ah, here is the other invocation! But no, this shouldn't be done this
way. This really belongs into vendor independent code, even if the
indirect call overhead is slightly higher.
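
I.e. drop the call from vmx_ctxt_switch_to() and invoke the hook from the
vendor independent x86 code instead, e.g. (sketch, using the wrapper this
patch already adds to domain.c):

    /* xen/arch/x86/domain.c:__context_switch() */
    nd->arch.ctxt_switch_to(n);
    pi_ctxt_switch_to(n);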

> --- a/xen/include/asm-x86/hvm/vmx/vmcs.h
> +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
> @@ -164,6 +164,14 @@ struct arch_vmx_struct {
>  
>      struct list_head     pi_blocked_vcpu_list;
>      struct list_head     pi_vcpu_on_set_list;
> +
> +    /*
> +     * Before vCPU is blocked, it is added to the global per-cpu list
> +     * of 'pi_block_cpu', then VT-d engine can send wakeup notification
> +     * event to 'pi_block_cpu' and wakeup the related vCPU.
> +     */
> +    int                  pi_block_cpu;

I can see that using int is in line with storing -1 into the field. But
generally CPU numbers should be unsigned. Please make it so,
using NR_CPUS or UINT_MAX in place of the -1.
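
I.e. something like

    unsigned int         pi_block_cpu;

initialized to NR_CPUS in vmx_vcpu_initialise(), with the checks above
then becoming e.g.

    if ( v->arch.hvm_vmx.pi_block_cpu != NR_CPUS )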

Jan
