Xen project Mailing List

Re: [Xen-devel] [PATCH v3 6/7] xen: credit1: increase efficiency and scalability of load balancing.

To: Dario Faggioli <dario.faggioli@xxxxxxxxxx>

From: George Dunlap <george.dunlap@xxxxxxxxxx>

Date: Fri, 7 Apr 2017 18:15:37 +0100

Cc: xen-devel <xen-devel@xxxxxxxxxxxxxxxxxxxx>, Anshul Makkar <anshul.makkar@xxxxxxxxxx>, Andrew Cooper <andrew.cooper3@xxxxxxxxxx>

Delivery-date: Fri, 07 Apr 2017 17:15:47 +0000

List-id: Xen developer discussion <xen-devel.lists.xen.org>

On Fri, Apr 7, 2017 at 5:57 PM, Dario Faggioli <dario.faggioli@xxxxxxxxxx> wrote: > During load balancing, we check the non idle pCPUs to > see if they have runnable but not running vCPUs that > can be stolen by and set to run on currently idle pCPUs. > > If a pCPU has only one running (or runnable) vCPU, > though, we don't want to steal it from there, and > it's therefore pointless bothering with it > (especially considering that bothering means trying > to take its runqueue lock!). > > On large systems, when load is only slightly higher > than the number of pCPUs (i.e., there are just a few > more active vCPUs than the number of the pCPUs), this > may mean that: > - we go through all the pCPUs, > - for each one, we (try to) take its runqueue locks, > - we figure out there's actually nothing to be stolen! > > To mitigate this, we introduce a counter for the number > of runnable vCPUs on each pCPU. In fact, unless there > re least 2 runnable vCPUs --typically, one running, > and the others in the runqueue-- it does not make sense > to try stealing anything. > > signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx> Reviewed-by: George Dunlap <george.dunlap@xxxxxxxxxx> > --- > Cc: George Dunlap <george.dunlap@xxxxxxxxxx> > Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> > Cc: Anshul Makkar <anshul.makkar@xxxxxxxxxx> > --- > Changes from v2: > * do as much as nr-runnable accounting inside runqueue remove and runqueue > insert helpers. Add comments for the instances that must live outside of > them. > > Changes from v1: > * don't count the idle vCPU as runnable. This is just cosmetic and not at all > a > logic or functionl change wrt v1; > * don't count inside of __runq_remove() or __runq_insert(), but provide > specific counting functions, and call them when appropriate. This is > necessary > to avoid spurious overloaded state to be reported, basically *all* the time > that a pCPU goes through the scheduler (due to the fact that the scheduler > calls __runq_insert() on the current vCPU); > * get rid of the overloaded cpumask and only use the counter. I actually did > like the cpumask solution better, but for this purpose, it was overkill. > --- > xen/common/sched_credit.c | 97 > ++++++++++++++++++++++++++++++++++++++++----- > 1 file changed, 87 insertions(+), 10 deletions(-) > > diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c > index 59b87f7..a0ad167 100644 > --- a/xen/common/sched_credit.c > +++ b/xen/common/sched_credit.c > @@ -172,6 +172,7 @@ struct csched_pcpu { > struct timer ticker; > unsigned int tick; > unsigned int idle_bias; > + unsigned int nr_runnable; > }; > > /* > @@ -262,9 +263,26 @@ static inline bool_t is_runq_idle(unsigned int cpu) > } > > static inline void > +inc_nr_runnable(unsigned int cpu) > +{ > + ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock)); > + CSCHED_PCPU(cpu)->nr_runnable++; > + > +} > + > +static inline void > +dec_nr_runnable(unsigned int cpu) > +{ > + ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock)); > + CSCHED_PCPU(cpu)->nr_runnable--; > + ASSERT(CSCHED_PCPU(cpu)->nr_runnable >= 0); > +} > + > +static inline void > __runq_insert(struct csched_vcpu *svc) > { > - const struct list_head * const runq = RUNQ(svc->vcpu->processor); > + unsigned int cpu = svc->vcpu->processor; > + const struct list_head * const runq = RUNQ(cpu); > struct list_head *iter; > > BUG_ON( __vcpu_on_runq(svc) ); > @@ -292,12 +310,25 @@ __runq_insert(struct csched_vcpu *svc) > } > > static inline void > +runq_insert(struct csched_vcpu *svc) > +{ > + __runq_insert(svc); > + inc_nr_runnable(svc->vcpu->processor); > +} > + > +static inline void > __runq_remove(struct csched_vcpu *svc) > { > BUG_ON( !__vcpu_on_runq(svc) ); > list_del_init(&svc->runq_elem); > } > > +static inline void > +runq_remove(struct csched_vcpu *svc) > +{ > + dec_nr_runnable(svc->vcpu->processor); > + __runq_remove(svc); > +} > > #define for_each_csched_balance_step(step) \ > for ( (step) = 0; (step) <= CSCHED_BALANCE_HARD_AFFINITY; (step)++ ) > @@ -601,6 +632,7 @@ init_pdata(struct csched_private *prv, struct csched_pcpu > *spc, int cpu) > /* Start off idling... */ > BUG_ON(!is_idle_vcpu(curr_on_cpu(cpu))); > cpumask_set_cpu(cpu, prv->idlers); > + spc->nr_runnable = 0; > } > > static void > @@ -1052,7 +1084,7 @@ csched_vcpu_insert(const struct scheduler *ops, struct > vcpu *vc) > lock = vcpu_schedule_lock_irq(vc); > > if ( !__vcpu_on_runq(svc) && vcpu_runnable(vc) && !vc->is_running ) > - __runq_insert(svc); > + runq_insert(svc); > > vcpu_schedule_unlock_irq(lock, vc); > > @@ -1117,7 +1149,7 @@ csched_vcpu_sleep(const struct scheduler *ops, struct > vcpu *vc) > cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); > } > else if ( __vcpu_on_runq(svc) ) > - __runq_remove(svc); > + runq_remove(svc); > } > > static void > @@ -1177,7 +1209,7 @@ csched_vcpu_wake(const struct scheduler *ops, struct > vcpu *vc) > } > > /* Put the VCPU on the runq and tickle CPUs */ > - __runq_insert(svc); > + runq_insert(svc); > __runq_tickle(svc); > } > > @@ -1679,8 +1711,14 @@ csched_runq_steal(int peer_cpu, int cpu, int pri, int > balance_step) > SCHED_VCPU_STAT_CRANK(speer, migrate_q); > SCHED_STAT_CRANK(migrate_queued); > WARN_ON(vc->is_urgent); > - __runq_remove(speer); > + runq_remove(speer); > vc->processor = cpu; > + /* > + * speer will start executing directly on cpu, without having to > + * go through runq_insert(). So we must update the runnable count > + * for cpu here. > + */ > + inc_nr_runnable(cpu); > return speer; > } > } > @@ -1736,7 +1774,7 @@ csched_load_balance(struct csched_private *prv, int cpu, > peer_node = node; > do > { > - /* Find out what the !idle are in this node */ > + /* Select the pCPUs in this node that have work we can steal. */ > cpumask_andnot(&workers, online, prv->idlers); > cpumask_and(&workers, &workers, &node_to_cpumask(peer_node)); > __cpumask_clear_cpu(cpu, &workers); > @@ -1746,6 +1784,40 @@ csched_load_balance(struct csched_private *prv, int > cpu, > goto next_node; > do > { > + spinlock_t *lock; > + > + /* > + * If there is only one runnable vCPU on peer_cpu, it means > + * there's no one to be stolen in its runqueue, so skip it. > + * > + * Checking this without holding the lock is racy... But > that's > + * the whole point of this optimization! > + * > + * In more details: > + * - if we race with dec_nr_runnable(), we may try to take > the > + * lock and call csched_runq_steal() for no reason. This is > + * not a functional issue, and should be infrequent enough. > + * And we can avoid that by re-checking nr_runnable after > + * having grabbed the lock, if we want; > + * - if we race with inc_nr_runnable(), we skip a pCPU that > may > + * have runnable vCPUs in its runqueue, but that's not a > + * problem because: > + * + if racing with csched_vcpu_insert() or > csched_vcpu_wake(), > + * __runq_tickle() will be called afterwords, so the vCPU > + * won't get stuck in the runqueue for too long; > + * + if racing with csched_runq_steal(), it may be that a > + * vCPU that we could have picked up, stays in a runqueue > + * until someone else tries to steal it again. But this > is > + * no worse than what can happen already (without this > + * optimization), it the pCPU would schedule right after > we > + * have taken the lock, and hence block on it. > + */ > + if ( CSCHED_PCPU(peer_cpu)->nr_runnable <= 1 ) > + { > + TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skipp'n */ > 0); > + goto next_cpu; > + } > + > /* > * Get ahold of the scheduler lock for this peer CPU. > * > @@ -1753,14 +1825,13 @@ csched_load_balance(struct csched_private *prv, int > cpu, > * could cause a deadlock if the peer CPU is also load > * balancing and trying to lock this CPU. > */ > - spinlock_t *lock = pcpu_schedule_trylock(peer_cpu); > + lock = pcpu_schedule_trylock(peer_cpu); > SCHED_STAT_CRANK(steal_trylock); > if ( !lock ) > { > SCHED_STAT_CRANK(steal_trylock_failed); > TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* skip */ 0); > - peer_cpu = cpumask_cycle(peer_cpu, &workers); > - continue; > + goto next_cpu; > } > > TRACE_2D(TRC_CSCHED_STEAL_CHECK, peer_cpu, /* checked */ 1); > @@ -1777,6 +1848,7 @@ csched_load_balance(struct csched_private *prv, int cpu, > return speer; > } > > + next_cpu: > peer_cpu = cpumask_cycle(peer_cpu, &workers); > > } while( peer_cpu != cpumask_first(&workers) ); > @@ -1907,7 +1979,11 @@ csched_schedule( > if ( vcpu_runnable(current) ) > __runq_insert(scurr); > else > + { > BUG_ON( is_idle_vcpu(current) || list_empty(runq) ); > + /* Current has blocked. Update the runnable counter for this cpu. */ > + dec_nr_runnable(cpu); > + } > > snext = __runq_elem(runq->next); > ret.migrated = 0; > @@ -2024,7 +2100,8 @@ csched_dump_pcpu(const struct scheduler *ops, int cpu) > runq = &spc->runq; > > cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, > cpu)); > - printk("CPU[%02d] sort=%d, sibling=%s, ", cpu, spc->runq_sort_last, > cpustr); > + printk("CPU[%02d] nr_run=%d, sort=%d, sibling=%s, ", > + cpu, spc->nr_runnable, spc->runq_sort_last, cpustr); > cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu)); > printk("core=%s\n", cpustr); > > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@xxxxxxxxxxxxx > https://lists.xen.org/xen-devel _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx https://lists.xen.org/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.