|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH v2 1/5] tasklet: Introduce per-cpu tasklet for softirq.
This implements a lockless per-cpu tasklet mechanism.
The existing tasklet mechanism has a single global
spinlock that is taken every-time the global list
is touched. And we use this lock quite a lot - when
we call do_tasklet_work which is called via an softirq
and from the idle loop. We take the lock on any
operation on the tasklet_list.
The problem we are facing is that there are quite a lot of
tasklets scheduled. The most common one that is invoked is
the one injecting the VIRQ_TIMER in the guest. Guests
are not insane and don't set the one-shot or periodic
clocks to be in sub 1ms intervals (causing said tasklet
to be scheduled for such small intervalls).
This is especially an problem with guests that have a
large amount of VCPUs.
With this patch the problem disappears.
As such this patch introduces the code to setup
softirq per-cpu tasklets and only modifies the PCI
passthrough cases instead of doing it wholesale. This
is done because:
- We want to easily bisect it if things break.
- We modify the code one section at a time to
make it easier to review this core code.
Now on the code itself. The Linux code (softirq.c)
has an per-cpu implementation of tasklets on which
this was based on. However there are differences:
- This patch executes one tasklet at a time - similar
to how the existing implementation does it.
- We use a double-linked list instead of a single linked
list. We could use a single-linked list but folks are
more familiar with 'list_*' type macros.
- This patch does not have the cross-CPU feeders
implemented. That code is in the patch
titled: tasklet: Add cross CPU feeding of per-cpu
tasklets. This is done to support:
"tasklet_schedule_on_cpu"
- We add an temporary 'TASKLET_SOFTIRQ_PERCPU' which
is can co-exist with the TASKLET_SOFTIRQ. It will be
replaced in 'tasklet: Remove the old-softirq
implementation."
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx>
---
RFC: First version
v1: Posted, folks asked if ticketlocks fixed it.
v2: Intel confirmed at XPDS 2016 that the problem is still present
with large guests.
Cc: Jan Beulich <jbeulich@xxxxxxxx>
Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
Cc: "Lan, Tianyu" <tianyu.lan@xxxxxxxxx>
Cc: Kevin Tian <kevin.tian@xxxxxxxxx>
Cc: Jun Nakajima <jun.nakajima@xxxxxxxxx>
---
xen/arch/x86/hvm/hvm.c | 2 +-
xen/common/tasklet.c | 129 ++++++++++++++++++++++++++++++++++++++++++++--
xen/include/xen/softirq.h | 1 +
xen/include/xen/tasklet.h | 61 ++++++++++++++++++++--
4 files changed, 183 insertions(+), 10 deletions(-)
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 0180f26..d933ddd 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1511,7 +1511,7 @@ int hvm_vcpu_initialise(struct vcpu *v)
if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 ) /* teardown:
hvm_funcs.vcpu_destroy */
goto fail3;
- softirq_tasklet_init(
+ percpu_tasklet_init(
&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet,
(void(*)(unsigned long))hvm_assert_evtchn_irq,
(unsigned long)v);
diff --git a/xen/common/tasklet.c b/xen/common/tasklet.c
index 4e42fa7..319866f 100644
--- a/xen/common/tasklet.c
+++ b/xen/common/tasklet.c
@@ -31,10 +31,30 @@ static DEFINE_PER_CPU(struct list_head,
softirq_tasklet_list);
/* Protects all lists and tasklet structures. */
static DEFINE_SPINLOCK(tasklet_lock);
+static DEFINE_PER_CPU(struct list_head, softirq_list);
+
static void tasklet_enqueue(struct tasklet *t)
{
unsigned int cpu = t->scheduled_on;
+ if ( t->is_percpu )
+ {
+ unsigned long flags;
+ struct list_head *list;
+
+ INIT_LIST_HEAD(&t->list);
+ BUG_ON( !t->is_softirq );
+ BUG_ON( cpu != smp_processor_id() ); /* Not implemented yet. */
+
+ local_irq_save(flags);
+
+ list = &__get_cpu_var(softirq_list);
+ list_add_tail(&t->list, list);
+ raise_softirq(TASKLET_SOFTIRQ_PERCPU);
+
+ local_irq_restore(flags);
+ return;
+ }
if ( t->is_softirq )
{
struct list_head *list = &per_cpu(softirq_tasklet_list, cpu);
@@ -56,16 +76,25 @@ void tasklet_schedule_on_cpu(struct tasklet *t, unsigned
int cpu)
{
unsigned long flags;
- spin_lock_irqsave(&tasklet_lock, flags);
+ if ( !tasklets_initialised || t->is_dead )
+ return;
- if ( tasklets_initialised && !t->is_dead )
+ if ( t->is_percpu )
{
- t->scheduled_on = cpu;
- if ( !t->is_running )
+ if ( !test_and_set_bit(TASKLET_STATE_SCHED, &t->state) )
{
- list_del(&t->list);
+ t->scheduled_on = cpu;
tasklet_enqueue(t);
}
+ return;
+ }
+ spin_lock_irqsave(&tasklet_lock, flags);
+
+ t->scheduled_on = cpu;
+ if ( !t->is_running )
+ {
+ list_del(&t->list);
+ tasklet_enqueue(t);
}
spin_unlock_irqrestore(&tasklet_lock, flags);
@@ -104,6 +133,66 @@ static void do_tasklet_work(unsigned int cpu, struct
list_head *list)
}
}
+void do_tasklet_work_percpu(void)
+{
+ struct tasklet *t = NULL;
+ struct list_head *head;
+ bool_t poke = 0;
+
+ local_irq_disable();
+ head = &__get_cpu_var(softirq_list);
+
+ if ( !list_empty(head) )
+ {
+ t = list_entry(head->next, struct tasklet, list);
+
+ if ( head->next == head->prev ) /* One singular item. Re-init head. */
+ INIT_LIST_HEAD(&__get_cpu_var(softirq_list));
+ else
+ {
+ /* Multiple items, update head to skip 't'. */
+ struct list_head *list;
+
+ /* One item past 't'. */
+ list = head->next->next;
+
+ BUG_ON(list == NULL);
+
+ /* And update head to skip 't'. Note that t->list.prev still
+ * points to head, but we don't care as we only process one tasklet
+ * and once done the tasklet list is re-init one way or another.
+ */
+ head->next = list;
+ poke = 1;
+ }
+ }
+ local_irq_enable();
+
+ if ( !t )
+ return; /* Never saw it happend, but we might have a spurious case? */
+
+ if ( tasklet_trylock(t) )
+ {
+ if ( !test_and_clear_bit(TASKLET_STATE_SCHED, &t->state) )
+ BUG();
+ sync_local_execstate();
+ t->func(t->data);
+ tasklet_unlock(t);
+ if ( poke )
+ raise_softirq(TASKLET_SOFTIRQ_PERCPU);
+ /* We could reinit the t->list but tasklet_enqueue does it for us. */
+ return;
+ }
+
+ local_irq_disable();
+
+ INIT_LIST_HEAD(&t->list);
+ list_add_tail(&t->list, &__get_cpu_var(softirq_list));
+ smp_wmb();
+ raise_softirq(TASKLET_SOFTIRQ_PERCPU);
+ local_irq_enable();
+}
+
/* VCPU context work */
void do_tasklet(void)
{
@@ -147,10 +236,29 @@ static void tasklet_softirq_action(void)
spin_unlock_irq(&tasklet_lock);
}
+/* Per CPU softirq context work. */
+static void tasklet_softirq_percpu_action(void)
+{
+ do_tasklet_work_percpu();
+}
+
void tasklet_kill(struct tasklet *t)
{
unsigned long flags;
+ if ( t->is_percpu )
+ {
+ while ( test_and_set_bit(TASKLET_STATE_SCHED, &t->state) )
+ {
+ do {
+ process_pending_softirqs();
+ } while ( test_bit(TASKLET_STATE_SCHED, &t->state) );
+ }
+ tasklet_unlock_wait(t);
+ clear_bit(TASKLET_STATE_SCHED, &t->state);
+ t->is_dead = 1;
+ return;
+ }
spin_lock_irqsave(&tasklet_lock, flags);
if ( !list_empty(&t->list) )
@@ -208,6 +316,14 @@ void softirq_tasklet_init(
t->is_softirq = 1;
}
+void percpu_tasklet_init(
+ struct tasklet *t, void (*func)(unsigned long), unsigned long data)
+{
+ tasklet_init(t, func, data);
+ t->is_softirq = 1;
+ t->is_percpu = 1;
+}
+
static int cpu_callback(
struct notifier_block *nfb, unsigned long action, void *hcpu)
{
@@ -218,11 +334,13 @@ static int cpu_callback(
case CPU_UP_PREPARE:
INIT_LIST_HEAD(&per_cpu(tasklet_list, cpu));
INIT_LIST_HEAD(&per_cpu(softirq_tasklet_list, cpu));
+ INIT_LIST_HEAD(&per_cpu(softirq_list, cpu));
break;
case CPU_UP_CANCELED:
case CPU_DEAD:
migrate_tasklets_from_cpu(cpu, &per_cpu(tasklet_list, cpu));
migrate_tasklets_from_cpu(cpu, &per_cpu(softirq_tasklet_list, cpu));
+ migrate_tasklets_from_cpu(cpu, &per_cpu(softirq_list, cpu));
break;
default:
break;
@@ -242,6 +360,7 @@ void __init tasklet_subsys_init(void)
cpu_callback(&cpu_nfb, CPU_UP_PREPARE, hcpu);
register_cpu_notifier(&cpu_nfb);
open_softirq(TASKLET_SOFTIRQ, tasklet_softirq_action);
+ open_softirq(TASKLET_SOFTIRQ_PERCPU, tasklet_softirq_percpu_action);
tasklets_initialised = 1;
}
diff --git a/xen/include/xen/softirq.h b/xen/include/xen/softirq.h
index 0895a16..6dda1ba 100644
--- a/xen/include/xen/softirq.h
+++ b/xen/include/xen/softirq.h
@@ -7,6 +7,7 @@ enum {
SCHEDULE_SOFTIRQ,
NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ,
RCU_SOFTIRQ,
+ TASKLET_SOFTIRQ_PERCPU,
TASKLET_SOFTIRQ,
NR_COMMON_SOFTIRQS
};
diff --git a/xen/include/xen/tasklet.h b/xen/include/xen/tasklet.h
index 8c3de7e..9497c47 100644
--- a/xen/include/xen/tasklet.h
+++ b/xen/include/xen/tasklet.h
@@ -17,21 +17,24 @@
struct tasklet
{
struct list_head list;
+ unsigned long state;
int scheduled_on;
bool_t is_softirq;
bool_t is_running;
bool_t is_dead;
+ bool_t is_percpu;
void (*func)(unsigned long);
unsigned long data;
};
-#define _DECLARE_TASKLET(name, func, data, softirq) \
+#define _DECLARE_TASKLET(name, func, data, softirq, percpu) \
struct tasklet name = { \
- LIST_HEAD_INIT(name.list), -1, softirq, 0, 0, func, data }
+ LIST_HEAD_INIT(name.list), 0, -1, softirq, 0, 0, percpu, \
+ func, data }
#define DECLARE_TASKLET(name, func, data) \
- _DECLARE_TASKLET(name, func, data, 0)
+ _DECLARE_TASKLET(name, func, data, 0, 0)
#define DECLARE_SOFTIRQ_TASKLET(name, func, data) \
- _DECLARE_TASKLET(name, func, data, 1)
+ _DECLARE_TASKLET(name, func, data, 1, 0)
/* Indicates status of tasklet work on each CPU. */
DECLARE_PER_CPU(unsigned long, tasklet_work_to_do);
@@ -40,6 +43,54 @@ DECLARE_PER_CPU(unsigned long, tasklet_work_to_do);
#define TASKLET_enqueued (1ul << _TASKLET_enqueued)
#define TASKLET_scheduled (1ul << _TASKLET_scheduled)
+/* These fancy bit manipulations (bit 0 and bit 1) along with using a lock
+ * operation allow us to have four stages in tasklet life-time.
+ * a) 0x0: Completely empty (not scheduled nor running).
+ * b) 0x1: Scheduled but not running. Used to guard in 'tasklet_schedule'
+ * such that we will only schedule one. If it is scheduled and had never
+ * run (hence never clearing STATE_SCHED bit), tasklet_kill will spin
+ * forever on said tasklet. However 'tasklet_schedule' raises the
+ * softirq associated with the per-cpu - so it will run, albeit there might
+ * be a race (tasklet_kill spinning until the softirq handler runs).
+ * c) 0x2: it is running (only on one CPU) and can be scheduled on any
+ * CPU. The bit 0 - scheduled is cleared at this stage allowing
+ * 'tasklet_schedule' to succesfully schedule.
+ * d) 0x3: scheduled and running - only possible if the running tasklet calls
+ * tasklet_schedule (on same CPU) or the tasklet is scheduled from another
+ * CPU while the tasklet is running on another CPU.
+ *
+ * The two bits play a vital role in assuring that the tasklet is scheduled
+ * once and runs only once. The steps are:
+ *
+ * 1) tasklet_schedule: STATE_SCHED bit set (0x1), added on the per cpu list.
+ * 2) tasklet_softirq_percpu_action picks one tasklet from the list. Schedules
+ * itself later if there are more tasklets on it. Tries to set STATE_RUN bit
+ * (0x3) - if it fails adds tasklet back to the per-cpu list. If it succeeds
+ * clears the STATE_SCHED bit (0x2). Once tasklet completed, unsets STATE_RUN
+ * (0x0 or 0x1 if tasklet called tasklet_schedule).
+ */
+enum {
+ TASKLET_STATE_SCHED, /* Bit 0 */
+ TASKLET_STATE_RUN
+};
+
+static inline int tasklet_trylock(struct tasklet *t)
+{
+ return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state);
+}
+
+static inline void tasklet_unlock(struct tasklet *t)
+{
+ barrier();
+ clear_bit(TASKLET_STATE_RUN, &(t)->state);
+}
+static inline void tasklet_unlock_wait(struct tasklet *t)
+{
+ while (test_bit(TASKLET_STATE_RUN, &(t)->state))
+ {
+ barrier();
+ }
+}
void tasklet_schedule_on_cpu(struct tasklet *t, unsigned int cpu);
void tasklet_schedule(struct tasklet *t);
void do_tasklet(void);
@@ -48,6 +99,8 @@ void tasklet_init(
struct tasklet *t, void (*func)(unsigned long), unsigned long data);
void softirq_tasklet_init(
struct tasklet *t, void (*func)(unsigned long), unsigned long data);
+void percpu_tasklet_init(
+ struct tasklet *t, void (*func)(unsigned long), unsigned long data);
void tasklet_subsys_init(void);
#endif /* __XEN_TASKLET_H__ */
--
2.4.11
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |