[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [RFC PATCH] dpci: Put the dpci back on the list if running on another CPU.



Tuesday, March 17, 2015, 6:44:54 PM, you wrote:


>> >> Additionally I think it should be considered whether the bitmap
>> >> approach of interpreting ->state is the right one, and we don't
>> >> instead want a clean 3-state (idle, sched, run) model.
>> > 
>> > Could you elaborate a bit more please? As in three different unsigned int
>> > (or bool_t) that set in what state we are in?
>> 
>> An enum { STATE_IDLE, STATE_SCHED, STATE_RUN }. Especially
>> if my comment above turns out to be wrong, you'd have no real
>> need for the SCHED and RUN flags to be set at the same time.

> I cobbled what I believe is what you were thinking of.

> As you can see to preserve the existing functionality such as
> being able to schedule N amount of interrupt injections
> for the N interrupts we might get - I modified '->masked'
> to be an atomic counter.

> The end result is that we can still live-lock. Unless we:
>  - Drop on the floor the injection of N interrupts and
>    just deliver at max one per VMX_EXIT (and not bother
>    with interrupts arriving when we are in the VMX handler).

>  - Alter the softirq code slightly - to have a variant
>    which will only iterate once over the pending softirq
>    bits per call. (so save a copy of the bitmap on the
>    stack when entering the softirq handler - and use that.
>    We could also xor it against the current to catch any
>    non-duplicate bits being set that we should deal with).

> Here is the compile, but not run-time tested patch.

> From e7d8bcd7c5d32c520554a4ad69c4716246036002 Mon Sep 17 00:00:00 2001
> From: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
> Date: Tue, 17 Mar 2015 13:31:52 -0400
> Subject: [RFC PATCH] dpci: Switch to tristate instead of bitmap

> *TODO*:
>  - Writeup.
>  - Tests

Done, and unfortunately it doesn't fly ..
Some devices seem to work fine, others don't receive any interrupts shortly 
after boot like:
 40:          3          0          0          0  xen-pirq-ioapic-level  
cx25821[1]

Don't see any crashes or errors though, so it seems to silently lock somewhere.

--
Sander

> Suggested-by: Jan Beulich <jbuelich@xxxxxxxx>
> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
> ---
>  xen/drivers/passthrough/io.c | 140 
> ++++++++++++++++++++++++-------------------
>  xen/include/xen/hvm/irq.h    |   4 +-
>  2 files changed, 82 insertions(+), 62 deletions(-)

> diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
> index ae050df..663e104 100644
> --- a/xen/drivers/passthrough/io.c
> +++ b/xen/drivers/passthrough/io.c
> @@ -30,42 +30,28 @@
>  static DEFINE_PER_CPU(struct list_head, dpci_list);
>  
>  /*
> - * These two bit states help to safely schedule, deschedule, and wait until
> - * the softirq has finished.
> - *
> - * The semantics behind these two bits is as follow:
> - *  - STATE_SCHED - whoever modifies it has to ref-count the domain (->dom).
> - *  - STATE_RUN - only softirq is allowed to set and clear it. If it has
> - *      been set hvm_dirq_assist will RUN with a saved value of the
> - *      'struct domain' copied from 'pirq_dpci->dom' before STATE_RUN was 
> set.
> - *
> - * The usual states are: STATE_SCHED(set) -> STATE_RUN(set) ->
> - * STATE_SCHED(unset) -> STATE_RUN(unset).
> - *
> - * However the states can also diverge such as: STATE_SCHED(set) ->
> - * STATE_SCHED(unset) -> STATE_RUN(set) -> STATE_RUN(unset). That means
> - * the 'hvm_dirq_assist' never run and that the softirq did not do any
> - * ref-counting.
> - */
> -
> -enum {
> -    STATE_SCHED,
> -    STATE_RUN
> -};
> -
> -/*
>   * This can be called multiple times, but the softirq is only raised once.
> - * That is until the STATE_SCHED state has been cleared. The state can be
> - * cleared by: the 'dpci_softirq' (when it has executed 'hvm_dirq_assist'),
> - * or by 'pt_pirq_softirq_reset' (which will try to clear the state before
> + * That is until state is in init. The state can be changed  by:
> + * the 'dpci_softirq' (when it has executed 'hvm_dirq_assist'),
> + * or by 'pt_pirq_softirq_reset' (which will try to init the state before
>   * the softirq had a chance to run).
>   */
>  static void raise_softirq_for(struct hvm_pirq_dpci *pirq_dpci)
>  {
>      unsigned long flags;
>  
> -    if ( test_and_set_bit(STATE_SCHED, &pirq_dpci->state) )
> +    switch ( cmpxchg(&pirq_dpci->state, STATE_INIT, STATE_SCHED) )
> +    {
> +    case STATE_RUN:
> +    case STATE_SCHED:
> +        /*
> +         * The pirq_dpci->mapping has been incremented to let us know
> +         * how many we have left to do.
> +         */
>          return;
> +    case STATE_INIT:
> +        break;
> +    }
>  
>      get_knownalive_domain(pirq_dpci->dom);
>  
> @@ -85,7 +71,7 @@ static void raise_softirq_for(struct hvm_pirq_dpci 
> *pirq_dpci)
>   */
>  bool_t pt_pirq_softirq_active(struct hvm_pirq_dpci *pirq_dpci)
>  {
> -    if ( pirq_dpci->state & ((1 << STATE_RUN) | (1 << STATE_SCHED)) )
> +    if ( pirq_dpci->state != STATE_INIT )
>          return 1;
>  
>      /*
> @@ -109,22 +95,22 @@ static void pt_pirq_softirq_reset(struct hvm_pirq_dpci 
> *pirq_dpci)
>  
>      ASSERT(spin_is_locked(&d->event_lock));
>  
> -    switch ( cmpxchg(&pirq_dpci->state, 1 << STATE_SCHED, 0) )
> +    switch ( cmpxchg(&pirq_dpci->state, STATE_SCHED, STATE_INIT) )
>      {
> -    case (1 << STATE_SCHED):
> +    case STATE_SCHED:
>          /*
> -         * We are going to try to de-schedule the softirq before it goes in
> -         * STATE_RUN. Whoever clears STATE_SCHED MUST refcount the 'dom'.
> +         * We are going to try to de-schedule the softirq before it goes to
> +         * running state. Whoever moves from sched MUST refcount the 'dom'.
>           */
>          put_domain(d);
>          /* fallthrough. */
> -    case (1 << STATE_RUN):
> -    case (1 << STATE_RUN) | (1 << STATE_SCHED):
> +    case STATE_RUN:
> +    case STATE_INIT:
>          /*
> -         * The reason it is OK to reset 'dom' when STATE_RUN bit is set is 
> due
> +         * The reason it is OK to reset 'dom' when init or running is set is 
> due
>           * to a shortcut the 'dpci_softirq' implements. It stashes the 'dom'
> -         * in local variable before it sets STATE_RUN - and therefore will 
> not
> -         * dereference '->dom' which would crash.
> +         * in local variable before it sets state to running - and therefore
> +         * will not dereference '->dom' which would crash.
>           */
>          pirq_dpci->dom = NULL;
>          break;
> @@ -135,7 +121,7 @@ static void pt_pirq_softirq_reset(struct hvm_pirq_dpci 
> *pirq_dpci)
>       * or never initialized it). Note that we hold the lock that
>       * 'hvm_dirq_assist' could be spinning on.
>       */
> -    pirq_dpci->masked = 0;
> +    atomic_set(&pirq_dpci->masked, 0);
>  }
>  
>  bool_t pt_irq_need_timer(uint32_t flags)
> @@ -149,7 +135,8 @@ static int pt_irq_guest_eoi(struct domain *d, struct 
> hvm_pirq_dpci *pirq_dpci,
>      if ( __test_and_clear_bit(_HVM_IRQ_DPCI_EOI_LATCH_SHIFT,
>                                &pirq_dpci->flags) )
>      {
> -        pirq_dpci->masked = 0;
> +        ASSERT(atomic_read(&pirq_dpci->masked) <= 1);
> +        atomic_set(&pirq_dpci->masked, 0);
>          pirq_dpci->pending = 0;
>          pirq_guest_eoi(dpci_pirq(pirq_dpci));
>      }
> @@ -278,6 +265,8 @@ int pt_irq_create_bind(
>               * As such on every 'pt_irq_create_bind' call we MUST set it.
>               */
>              pirq_dpci->dom = d;
> +            atomic_set(&pirq_dpci->masked, 0);
> +            pirq_dpci->state = STATE_INIT;
>              /* bind after hvm_irq_dpci is setup to avoid race with irq 
> handler*/
>              rc = pirq_guest_bind(d->vcpu[0], info, 0);
>              if ( rc == 0 && pt_irq_bind->u.msi.gtable )
> @@ -398,6 +387,8 @@ int pt_irq_create_bind(
>              if ( pt_irq_need_timer(pirq_dpci->flags) )
>                  init_timer(&pirq_dpci->timer, pt_irq_time_out, pirq_dpci, 0);
>              /* Deal with gsi for legacy devices */
> +            atomic_set(&pirq_dpci->masked, 0);
> +            pirq_dpci->state = STATE_INIT;
>              rc = pirq_guest_bind(d->vcpu[0], info, share);
>              if ( unlikely(rc) )
>              {
> @@ -576,6 +567,7 @@ bool_t pt_pirq_cleanup_check(struct hvm_pirq_dpci *dpci)
>      if ( !dpci->flags && !pt_pirq_softirq_active(dpci) )
>      {
>          dpci->dom = NULL;
> +        dpci->state = STATE_INIT;
>          return 1;
>      }
>      return 0;
> @@ -617,7 +609,7 @@ int hvm_do_IRQ_dpci(struct domain *d, struct pirq *pirq)
>           !(pirq_dpci->flags & HVM_IRQ_DPCI_MAPPED) )
>          return 0;
>  
> -    pirq_dpci->masked = 1;
> +    atomic_inc(&pirq_dpci->masked);
>      raise_softirq_for(pirq_dpci);
>      return 1;
>  }
> @@ -672,12 +664,13 @@ void hvm_dpci_msi_eoi(struct domain *d, int vector)
>      spin_unlock(&d->event_lock);
>  }
>  
> -static void hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci 
> *pirq_dpci)
> +static bool_t hvm_dirq_assist(struct domain *d, struct hvm_pirq_dpci 
> *pirq_dpci)
>  {
> +    int more;
>      ASSERT(d->arch.hvm_domain.irq.dpci);
>  
>      spin_lock(&d->event_lock);
> -    if ( test_and_clear_bool(pirq_dpci->masked) )
> +    while ( (more = !atomic_dec_and_test(&pirq_dpci->masked)) )
>      {
>          struct pirq *pirq = dpci_pirq(pirq_dpci);
>          const struct dev_intx_gsi_link *digl;
> @@ -687,17 +680,13 @@ static void hvm_dirq_assist(struct domain *d, struct 
> hvm_pirq_dpci *pirq_dpci)
>              send_guest_pirq(d, pirq);
>  
>              if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI )
> -            {
> -                spin_unlock(&d->event_lock);
> -                return;
> -            }
> +                break;
>          }
>  
>          if ( pirq_dpci->flags & HVM_IRQ_DPCI_GUEST_MSI )
>          {
>              vmsi_deliver_pirq(d, pirq_dpci);
> -            spin_unlock(&d->event_lock);
> -            return;
> +            break;
>          }
>  
>          list_for_each_entry ( digl, &pirq_dpci->digl_list, list )
> @@ -710,8 +699,7 @@ static void hvm_dirq_assist(struct domain *d, struct 
> hvm_pirq_dpci *pirq_dpci)
>          {
>              /* for translated MSI to INTx interrupt, eoi as early as 
> possible */
>              __msi_pirq_eoi(pirq_dpci);
> -            spin_unlock(&d->event_lock);
> -            return;
> +            break;
>          }
>  
>          /*
> @@ -725,6 +713,8 @@ static void hvm_dirq_assist(struct domain *d, struct 
> hvm_pirq_dpci *pirq_dpci)
>          set_timer(&pirq_dpci->timer, NOW() + PT_IRQ_TIME_OUT);
>      }
>      spin_unlock(&d->event_lock);
> +
> +    return more;
>  }
>  
>  static void __hvm_dpci_eoi(struct domain *d,
> @@ -781,7 +771,7 @@ unlock:
>  }
>  
>  /*
> - * Note: 'pt_pirq_softirq_reset' can clear the STATE_SCHED before we get to
> + * Note: 'pt_pirq_softirq_reset' can move the state to init before we get to
>   * doing it. If that is the case we let 'pt_pirq_softirq_reset' do 
> ref-counting.
>   */
>  static void dpci_softirq(void)
> @@ -797,23 +787,53 @@ static void dpci_softirq(void)
>      {
>          struct hvm_pirq_dpci *pirq_dpci;
>          struct domain *d;
> +        bool_t again = 0;
>  
>          pirq_dpci = list_entry(our_list.next, struct hvm_pirq_dpci, 
> softirq_list);
>          list_del(&pirq_dpci->softirq_list);
>  
>          d = pirq_dpci->dom;
>          smp_mb(); /* 'd' MUST be saved before we set/clear the bits. */
> -        if ( test_and_set_bit(STATE_RUN, &pirq_dpci->state) )
> -            BUG();
> -        /*
> -         * The one who clears STATE_SCHED MUST refcount the domain.
> -         */
> -        if ( test_and_clear_bit(STATE_SCHED, &pirq_dpci->state) )
> +
> +        switch ( cmpxchg(&pirq_dpci->state, STATE_SCHED, STATE_RUN) )
> +        {
> +        case STATE_INIT:
> +            /*  pt_pirq_softirq_reset cleared it. Let it do the 
> ref-counting. */
> +            continue;
> +        case STATE_RUN:
> +            again = 1;
> +            /* Fall through. */
> +        case STATE_SCHED:
> +            break;
> +        }
> +        if ( !again )
>          {
> -            hvm_dirq_assist(d, pirq_dpci);
> +            /*
> +            * The one who changes sched to running MUST refcount the domain.
> +            */
> +            again = hvm_dirq_assist(d, pirq_dpci);
>              put_domain(d);
> +            switch ( cmpxchg(&pirq_dpci->state, STATE_RUN, STATE_INIT) )
> +            {
> +            case STATE_SCHED:
> +            case STATE_INIT:
> +                BUG();
> +            case STATE_RUN:
> +                break;
> +            }
> +        }
> +        if ( again )
> +        {
> +            unsigned long flags;
> +
> +            /* Put back on the list and retry. */
> +            local_irq_save(flags);
> +            list_add_tail(&pirq_dpci->softirq_list, &this_cpu(dpci_list));
> +            local_irq_restore(flags);
> +
> +            raise_softirq(HVM_DPCI_SOFTIRQ);
> +            continue;
>          }
> -        clear_bit(STATE_RUN, &pirq_dpci->state);
>      }
>  }
>  
> diff --git a/xen/include/xen/hvm/irq.h b/xen/include/xen/hvm/irq.h
> index 3996f1f..48dbc51 100644
> --- a/xen/include/xen/hvm/irq.h
> +++ b/xen/include/xen/hvm/irq.h
> @@ -93,8 +93,8 @@ struct hvm_irq_dpci {
>  /* Machine IRQ to guest device/intx mapping. */
>  struct hvm_pirq_dpci {
>      uint32_t flags;
> -    unsigned int state;
> -    bool_t masked;
> +    enum { STATE_INIT, STATE_SCHED, STATE_RUN } state;
> +    atomic_t masked;
>      uint16_t pending;
>      struct list_head digl_list;
>      struct domain *dom;


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.