[Xen-devel] [PATCH] xen: Credit2: stop hitting a BUG_ON() when creating a Credit2 cpupool
This fixes hitting this BUG_ON():

  xen/common/sched_credit2.c:3452
    rqd = c2rqd(ops, cpu);
    BUG_ON(!cpumask_test_cpu(cpu, &rqd->active));

when Credit2 is used as the default scheduler, and a cpupool, also
using Credit2, is created.

The BUG_ON() is hit because c2rqd() accesses
rqd[per_cpu(runq_map,cpu)]->active when the value of runq_map for this
cpu is -1 (and so, we basically access random memory).

The bug was introduced in a2c4e5ab ("xen: credit2: make the cpu to
runqueue map per-cpu"). In fact, in that commit the CPU to runq map
became a global data structure (instead of a per-scheduler array, as it
was before that commit). So, when a CPU is deassigned from a scheduler,
its corresponding value in the runq map is set to -1. And this makes
things explode, in case the CPU was also being assigned to another
scheduler.

Basically, when moving a CPU from one scheduler (cpupool) to another,
we call:
 - csched2_init_pdata(new_ops, cpu);
    |
    --> per_cpu(runq_map,cpu) = _new_runq_id_ ;
 - csched2_deinit_pdata(old_ops, cpu);
    |
    --> per_cpu(runq_map,cpu) = -1;

The solution is to put the information about which runqueue a CPU
belongs to in a 'csched2_pcpu' data structure (as, e.g., Credit does,
for all the per-pcpu pieces of information that it needs to keep), and
to use the sched_priv field of the per_cpu(schedule_data) structure to
reach it.

This solution has the following properties:
 - it prevents Xen from crashing; :-)
 - it's more in line with how the schedule.c<-->sched_foo.c interface
   has been designed, and is supposed to be used (that sched_priv field
   is specifically meant to be used for this kind of data);
 - the improvement brought by a2c4e5ab is retained: the large
   per-instance array of int-s for the CPU to runq mapping, which that
   commit got rid of, is *not* reintroduced. Actually, we use even less
   memory, as instead of a per-cpu set of int-s (one of which existed
   even for CPUs that don't belong to any Credit2 instance), we
   allocate a one-field-only (an int) struct, and only for the CPUs
   that do belong to a Credit2 instance.

Note that this also:
 * allows the init_pdata() internal helper function to return void,
   simplifying its callers (and making it more similar to its
   counterparts in other schedulers);
 * enables more extensive use of the c2rqd() helper, making the code
   more uniform and easier to read;
 * requires the hooks csched2_{alloc,free}_pdata to be defined for
   Credit2;
 * requires the assignment of new values to the 'scheduler' and
   'schedule_data.sched_priv' per-cpu variables to be moved up a bit,
   in csched2_switch_sched().

Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx>
---
Cc: George Dunlap <george.dunlap@xxxxxxxxxx>
Cc: Anshul Makkar <anshulmakkar@xxxxxxxxx>
Cc: Julien Grall <julien.grall@xxxxxxx>
---
Release-wise, Julien, considering that:
 - we're not even frozen yet,
 - in any case, this is a bugfix,
I'd say it should go in.
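As an illustration of the clobbering described above, here is a
stand-alone toy model (not Xen code: the toy_* names, NR_CPUS, and the
simplified bodies are invented purely for this example) of what happens
when a single global per-CPU map is updated in the init-new-then-
deinit-old order:

/*
 * Toy model of the bug (NOT Xen code).  One global per-CPU map is
 * shared by every scheduler instance, as after a2c4e5ab.  When a CPU
 * moves from one cpupool to another, init_pdata() of the new instance
 * runs first and deinit_pdata() of the old instance runs afterwards,
 * so the old instance's teardown overwrites the value the new
 * instance just wrote.
 */
#include <stdio.h>

#define NR_CPUS 4

static int runq_map[NR_CPUS];          /* global map, shared by all instances */

struct toy_sched { int runq_id; };     /* stands in for a Credit2 instance */

static void toy_init_pdata(struct toy_sched *new_ops, int cpu)
{
    runq_map[cpu] = new_ops->runq_id;  /* new pool: CPU joins its runqueue */
}

static void toy_deinit_pdata(struct toy_sched *old_ops, int cpu)
{
    (void)old_ops;
    runq_map[cpu] = -1;                /* old pool: "CPU is on no runqueue" */
}

int main(void)
{
    struct toy_sched pool0 = { .runq_id = 0 }, pool1 = { .runq_id = 1 };
    int cpu = 2;

    runq_map[cpu] = pool0.runq_id;     /* CPU currently belongs to pool0 */

    /* Ordering used when moving the CPU: init the new, then deinit the old. */
    toy_init_pdata(&pool1, cpu);       /* runq_map[cpu] = 1              */
    toy_deinit_pdata(&pool0, cpu);     /* runq_map[cpu] = -1 (clobbered) */

    printf("runq_map[%d] = %d\n", cpu, runq_map[cpu]);
    if ( runq_map[cpu] == -1 )
        printf("this is the state in which the BUG_ON() above then fires\n");
    return 0;
}

The patch below avoids this by keeping the runqueue ID in a
'csched2_pcpu' structure owned by the scheduler instance it was
allocated for, reached through the per-CPU sched_priv pointer, so
deinit_pdata() of the old instance can only ever reset its own copy.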
---
 xen/common/sched_credit2.c | 133 +++++++++++++++++++++++++++++---------------
 1 file changed, 89 insertions(+), 44 deletions(-)

diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index 18f39ca..4a08a9f 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -504,9 +504,11 @@ struct csched2_private {
  * Physical CPU
  *
  * The only per-pCPU information we need to maintain is of which runqueue
- * each CPU is part of.
+ * each CPU is part of (-1 to mean 'to none').
  */
-static DEFINE_PER_CPU(int, runq_map);
+struct csched2_pcpu {
+    int runq_id;
+};
 
 /*
  * Virtual CPU
@@ -565,6 +567,11 @@ static inline struct csched2_private *csched2_priv(const struct scheduler *ops)
     return ops->sched_data;
 }
 
+static inline struct csched2_pcpu *csched2_pcpu(unsigned int cpu)
+{
+    return per_cpu(schedule_data, cpu).sched_priv;
+}
+
 static inline struct csched2_vcpu *csched2_vcpu(const struct vcpu *v)
 {
     return v->sched_priv;
@@ -578,7 +585,7 @@ static inline struct csched2_dom *csched2_dom(const struct domain *d)
 /* CPU to runq_id macro */
 static inline int c2r(unsigned int cpu)
 {
-    return per_cpu(runq_map, cpu);
+    return csched2_pcpu(cpu)->runq_id;
 }
 
 /* CPU to runqueue struct macro */
@@ -3769,31 +3776,33 @@ csched2_dump(const struct scheduler *ops)
 #undef cpustr
 }
 
-/* Returns the ID of the runqueue the cpu is assigned to. */
-static unsigned
-init_pdata(struct csched2_private *prv, unsigned int cpu)
+static void
+init_pdata(struct csched2_private *prv,
+           struct csched2_pcpu *spc,
+           unsigned int cpu)
 {
-    unsigned rqi;
     struct csched2_runqueue_data *rqd;
 
     ASSERT(rw_is_write_locked(&prv->lock));
+    /* The CPU must not have been initialized already, in this scheduler. */
     ASSERT(!cpumask_test_cpu(cpu, &prv->initialized));
+    /* The per-CPU data needs to be allocated, but STILL uninitialized. */
+    ASSERT(spc && spc->runq_id == -1);
+    /* alloc_pdata must have been called already. */
+    ASSERT(per_cpu(schedule_data, cpu).sched_priv == spc);
 
     /* Figure out which runqueue to put it in */
-    rqi = cpu_to_runqueue(prv, cpu);
+    spc->runq_id = cpu_to_runqueue(prv, cpu);
 
-    rqd = prv->rqd + rqi;
+    rqd = prv->rqd + spc->runq_id;
 
-    printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, rqi);
-    if ( ! cpumask_test_cpu(rqi, &prv->active_queues) )
+    printk(XENLOG_INFO "Adding cpu %d to runqueue %d\n", cpu, spc->runq_id);
+    if ( !cpumask_test_cpu(spc->runq_id, &prv->active_queues) )
     {
         printk(XENLOG_INFO " First cpu on runqueue, activating\n");
-        activate_runqueue(prv, rqi);
+        activate_runqueue(prv, spc->runq_id);
     }
 
-    /* Set the runqueue map */
-    per_cpu(runq_map, cpu) = rqi;
-
     __cpumask_set_cpu(cpu, &rqd->idle);
     __cpumask_set_cpu(cpu, &rqd->active);
     __cpumask_set_cpu(cpu, &prv->initialized);
@@ -3801,8 +3810,21 @@ init_pdata(struct csched2_private *prv, unsigned int cpu)
 
     if ( cpumask_weight(&rqd->active) == 1 )
         rqd->pick_bias = cpu;
+}
 
-    return rqi;
+static void *
+csched2_alloc_pdata(const struct scheduler *ops, int cpu)
+{
+    struct csched2_pcpu *spc;
+
+    spc = xzalloc(struct csched2_pcpu);
+    if ( spc == NULL )
+        return ERR_PTR(-ENOMEM);
+
+    /* Initial situation: not part of any runqueue. */
+    spc->runq_id = -1;
+
+    return spc;
 }
 
 static void
@@ -3811,20 +3833,14 @@ csched2_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
     struct csched2_private *prv = csched2_priv(ops);
     spinlock_t *old_lock;
     unsigned long flags;
-    unsigned rqi;
-
-    /*
-     * pdata contains what alloc_pdata returned. But since we don't (need to)
-     * implement alloc_pdata, either that's NULL, or something is very wrong!
-     */
-    ASSERT(!pdata);
 
     write_lock_irqsave(&prv->lock, flags);
     old_lock = pcpu_schedule_lock(cpu);
 
-    rqi = init_pdata(prv, cpu);
+    init_pdata(prv, pdata, cpu);
+
     /* Move the scheduler lock to the new runq lock. */
-    per_cpu(schedule_data, cpu).schedule_lock = &prv->rqd[rqi].lock;
+    per_cpu(schedule_data, cpu).schedule_lock = &c2rqd(ops, cpu)->lock;
 
     /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
     spin_unlock(old_lock);
@@ -3838,9 +3854,8 @@ csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu,
 {
     struct csched2_private *prv = csched2_priv(new_ops);
     struct csched2_vcpu *svc = vdata;
-    unsigned rqi;
 
-    ASSERT(!pdata && svc && is_idle_vcpu(svc->vcpu));
+    ASSERT(pdata && svc && is_idle_vcpu(svc->vcpu));
 
     /*
      * We own one runqueue lock already (from schedule_cpu_switch()). This
@@ -3855,7 +3870,10 @@ csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu,
 
     idle_vcpu[cpu]->sched_priv = vdata;
 
-    rqi = init_pdata(prv, cpu);
+    per_cpu(scheduler, cpu) = new_ops;
+    per_cpu(schedule_data, cpu).sched_priv = pdata;
+
+    init_pdata(prv, pdata, cpu);
 
     /*
      * Now that we know what runqueue we'll go in, double check what's said
@@ -3863,10 +3881,7 @@ csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu,
      * this scheduler, and so it's safe to have taken it /before/ our
      * private global lock.
      */
-    ASSERT(per_cpu(schedule_data, cpu).schedule_lock != &prv->rqd[rqi].lock);
-
-    per_cpu(scheduler, cpu) = new_ops;
-    per_cpu(schedule_data, cpu).sched_priv = NULL; /* no pdata */
+    ASSERT(per_cpu(schedule_data, cpu).schedule_lock != &c2rqd(new_ops, cpu)->lock);
 
     /*
      * (Re?)route the lock to the per pCPU lock as /last/ thing. In fact,
@@ -3874,7 +3889,7 @@ csched2_switch_sched(struct scheduler *new_ops, unsigned int cpu,
      * taking it, find all the initializations we've done above in place.
      */
     smp_mb();
-    per_cpu(schedule_data, cpu).schedule_lock = &prv->rqd[rqi].lock;
+    per_cpu(schedule_data, cpu).schedule_lock = &c2rqd(new_ops, cpu)->lock;
 
     write_unlock(&prv->lock);
 }
@@ -3885,25 +3900,27 @@ csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
     unsigned long flags;
     struct csched2_private *prv = csched2_priv(ops);
     struct csched2_runqueue_data *rqd;
-    int rqi;
+    struct csched2_pcpu *spc = pcpu;
 
     write_lock_irqsave(&prv->lock, flags);
 
+    /* Both alloc_pdata and init_pdata must have been called on this CPU. */
+    ASSERT(spc && spc->runq_id != -1 &&
+           cpumask_test_cpu(cpu, &prv->initialized));
+
     /*
-     * alloc_pdata is not implemented, so pcpu must be NULL. On the other
-     * hand, init_pdata must have been called for this pCPU.
+     * NB. Since per_cpu(schedule_data,cpu).sched_priv may have been set to
+     * something that is different than pcpu already, we **can't** use the
+     * csched2_pcpu(), c2r() and c2rqd() helpers in here!
      */
-    ASSERT(!pcpu && cpumask_test_cpu(cpu, &prv->initialized));
-
-    /* Find the old runqueue and remove this cpu from it */
-    rqi = per_cpu(runq_map, cpu);
-
-    rqd = prv->rqd + rqi;
+
+    /* Find the old runqueue and remove this cpu from it */
+    rqd = prv->rqd + spc->runq_id;
 
     /* No need to save IRQs here, they're already disabled */
     spin_lock(&rqd->lock);
 
-    printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, rqi);
+    printk(XENLOG_INFO "Removing cpu %d from runqueue %d\n", cpu, spc->runq_id);
 
     __cpumask_clear_cpu(cpu, &rqd->idle);
     __cpumask_clear_cpu(cpu, &rqd->smt_idle);
@@ -3912,12 +3929,12 @@ csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
 
     if ( cpumask_empty(&rqd->active) )
     {
         printk(XENLOG_INFO " No cpus left on runqueue, disabling\n");
-        deactivate_runqueue(prv, rqi);
+        deactivate_runqueue(prv, spc->runq_id);
     }
     else if ( rqd->pick_bias == cpu )
         rqd->pick_bias = cpumask_first(&rqd->active);
 
-    per_cpu(runq_map, cpu) = -1;
+    spc->runq_id = -1;
 
     spin_unlock(&rqd->lock);
 
@@ -3928,6 +3945,32 @@ csched2_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
     return;
 }
 
+static void
+csched2_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
+{
+    struct csched2_private *prv = csched2_priv(ops);
+
+    /*
+     * We want to be sure that either init_pdata has never been called, for
+     * this CPU in this scheduler, or that deinit_pdata has been called on
+     * it already.
+     */
+    ASSERT(!cpumask_test_cpu(cpu, &prv->initialized));
+
+    /*
+     * pcpu can be NULL, in case we are coming from CPU_UP_CANCELLED
+     * (because bringing up the CPU failed). And if it is not, it must,
+     * for the same reasons explained in the above comment, be invalid.
+     */
+    if ( pcpu != NULL )
+    {
+        struct csched2_pcpu *spc = pcpu;
+
+        ASSERT(spc->runq_id == -1);
+        xfree(spc);
+    }
+}
+
 static int
 csched2_init(struct scheduler *ops)
 {
@@ -4045,8 +4088,10 @@ static const struct scheduler sched_credit2_def = {
     .deinit         = csched2_deinit,
     .alloc_vdata    = csched2_alloc_vdata,
     .free_vdata     = csched2_free_vdata,
+    .alloc_pdata    = csched2_alloc_pdata,
     .init_pdata     = csched2_init_pdata,
     .deinit_pdata   = csched2_deinit_pdata,
+    .free_pdata     = csched2_free_pdata,
     .switch_sched   = csched2_switch_sched,
     .alloc_domdata  = csched2_alloc_domdata,
     .free_domdata   = csched2_free_domdata,

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel