Xen project Mailing List

[Xen-devel] [PATCH 05/24] xen: credit2: make tickling more deterministic

From: Dario Faggioli <dario.faggioli@xxxxxxxxxx>

Date: Wed, 17 Aug 2016 19:18:08 +0200

Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Anshul Makkar <anshul.makkar@xxxxxxxxxx>, George Dunlap <george.dunlap@xxxxxxxxxx>, Jan Beulich <JBeulich@xxxxxxxx>

Delivery-date: Wed, 17 Aug 2016 17:18:14 +0000

List-id: Xen developer discussion <xen-devel.lists.xen.org>

Right now, the following scenario can occurr: - upon vcpu v wakeup, v itself is put in the runqueue, and pcpu X is tickled; - pcpu Y schedules (for whatever reason), sees v in the runqueue and picks it up. This may seem ok (or even a good thing), but it's not. In fact, if runq_tickle() decided X is where v should run, it did it for a reason (load distribution, SMT support, cache hotness, affinity, etc), and we really should try as hard as possible to stick to that. Of course, we can't be too strict, or we risk leaving vcpus in the runqueue while there is available CPU capacity. So, we only leave v in runqueue --for X to pick it up-- if we see that X has been tickled and has not scheduled yet, i.e., it will have a real chance of actually select and schedule v. If that is not the case, we schedule it on Y (or, at least, we consider that), as running somewhere non-ideal is better than not running at all. The commit also adds performance counters for each of the possible situations. Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx> --- Cc: George Dunlap <george.dunlap@xxxxxxxxxx> Cc: Anshul Makkar <anshul.makkar@xxxxxxxxxx> Cc: Jan Beulich <JBeulich@xxxxxxxx> Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx> --- xen/common/sched_credit2.c | 65 +++++++++++++++++++++++++++++++++++++++--- xen/include/xen/perfc_defn.h | 3 ++ 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c index 12dfd20..a3d7beb 100644 --- a/xen/common/sched_credit2.c +++ b/xen/common/sched_credit2.c @@ -54,6 +54,7 @@ #define TRC_CSCHED2_LOAD_CHECK TRC_SCHED_CLASS_EVT(CSCHED2, 16) #define TRC_CSCHED2_LOAD_BALANCE TRC_SCHED_CLASS_EVT(CSCHED2, 17) #define TRC_CSCHED2_PICKED_CPU TRC_SCHED_CLASS_EVT(CSCHED2, 19) +#define TRC_CSCHED2_RUNQ_CANDIDATE TRC_SCHED_CLASS_EVT(CSCHED2, 20) /* * WARNING: This is still in an experimental phase. Status and work can be found at the @@ -398,6 +399,7 @@ struct csched2_vcpu { int credit; s_time_t start_time; /* When we were scheduled (used for credit) */ unsigned flags; /* 16 bits doesn't seem to play well with clear_bit() */ + int tickled_cpu; /* cpu tickled for picking us up (-1 if none) */ /* Individual contribution to load */ s_time_t load_last_update; /* Last time average was updated */ @@ -1049,6 +1051,10 @@ runq_tickle(const struct scheduler *ops, struct csched2_vcpu *new, s_time_t now) __cpumask_set_cpu(ipid, &rqd->tickled); smt_idle_mask_clear(ipid, &rqd->smt_idle); cpu_raise_softirq(ipid, SCHEDULE_SOFTIRQ); + + if ( unlikely(new->tickled_cpu != -1) ) + SCHED_STAT_CRANK(tickled_cpu_overwritten); + new->tickled_cpu = ipid; } /* @@ -1266,6 +1272,7 @@ csched2_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd) ASSERT(svc->sdom != NULL); svc->credit = CSCHED2_CREDIT_INIT; svc->weight = svc->sdom->weight; + svc->tickled_cpu = -1; /* Starting load of 50% */ svc->avgload = 1ULL << (CSCHED2_PRIV(ops)->load_precision_shift - 1); svc->load_last_update = NOW() >> LOADAVG_GRANULARITY_SHIFT; @@ -1273,6 +1280,7 @@ csched2_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd) else { ASSERT(svc->sdom == NULL); + svc->tickled_cpu = svc->vcpu->vcpu_id; svc->credit = CSCHED2_IDLE_CREDIT; svc->weight = 0; } @@ -2233,7 +2241,8 @@ void __dump_execstate(void *unused); static struct csched2_vcpu * runq_candidate(struct csched2_runqueue_data *rqd, struct csched2_vcpu *scurr, - int cpu, s_time_t now) + int cpu, s_time_t now, + unsigned int *pos) { struct list_head *iter; struct csched2_vcpu *snext = NULL; @@ -2262,13 +2271,29 @@ runq_candidate(struct csched2_runqueue_data *rqd, /* Only consider vcpus that are allowed to run on this processor. */ if ( !cpumask_test_cpu(cpu, svc->vcpu->cpu_hard_affinity) ) + { + (*pos)++; continue; + } + + /* + * If a vcpu is meant to be picked up by another processor, and such + * processor has not scheduled yet, leave it in the runqueue for him. + */ + if ( svc->tickled_cpu != -1 && svc->tickled_cpu != cpu && + cpumask_test_cpu(svc->tickled_cpu, &rqd->tickled) ) + { + (*pos)++; + SCHED_STAT_CRANK(deferred_to_tickled_cpu); + continue; + } /* If this is on a different processor, don't pull it unless * its credit is at least CSCHED2_MIGRATE_RESIST higher. */ if ( svc->vcpu->processor != cpu && snext->credit + CSCHED2_MIGRATE_RESIST > svc->credit ) { + (*pos)++; SCHED_STAT_CRANK(migrate_resisted); continue; } @@ -2280,9 +2305,26 @@ runq_candidate(struct csched2_runqueue_data *rqd, /* In any case, if we got this far, break. */ break; + } + if ( unlikely(tb_init_done) ) + { + struct { + unsigned vcpu:16, dom:16; + unsigned tickled_cpu, position; + } d; + d.dom = snext->vcpu->domain->domain_id; + d.vcpu = snext->vcpu->vcpu_id; + d.tickled_cpu = snext->tickled_cpu; + d.position = *pos; + __trace_var(TRC_CSCHED2_RUNQ_CANDIDATE, 1, + sizeof(d), + (unsigned char *)&d); } + if ( unlikely(snext->tickled_cpu != -1 && snext->tickled_cpu != cpu) ) + SCHED_STAT_CRANK(tickled_cpu_overridden); + return snext; } @@ -2298,6 +2340,7 @@ csched2_schedule( struct csched2_runqueue_data *rqd; struct csched2_vcpu * const scurr = CSCHED2_VCPU(current); struct csched2_vcpu *snext = NULL; + unsigned int snext_pos = 0; struct task_slice ret; SCHED_STAT_CRANK(schedule); @@ -2347,7 +2390,7 @@ csched2_schedule( snext = CSCHED2_VCPU(idle_vcpu[cpu]); } else - snext=runq_candidate(rqd, scurr, cpu, now); + snext = runq_candidate(rqd, scurr, cpu, now, &snext_pos); /* If switching from a non-idle runnable vcpu, put it * back on the runqueue. */ @@ -2371,8 +2414,21 @@ csched2_schedule( __set_bit(__CSFLAG_scheduled, &snext->flags); } - /* Check for the reset condition */ - if ( snext->credit <= CSCHED2_CREDIT_RESET ) + /* + * The reset condition is "has a scheduler epoch come to an end?". + * The way this is enforced is checking whether the vcpu at the top + * of the runqueue has negative credits. This means the epochs have + * variable lenght, as in one epoch expores when: + * 1) the vcpu at the top of the runqueue has executed for + * around 10 ms (with default parameters); + * 2) no other vcpu with higher credits wants to run. + * + * Here, where we want to check for reset, we need to make sure the + * proper vcpu is being used. In fact, runqueue_candidate() may have + * not returned the first vcpu in the runqueue, for various reasons + * (e.g., affinity). Only trigger a reset when it does. + */ + if ( snext_pos == 0 && snext->credit <= CSCHED2_CREDIT_RESET ) { reset_credit(ops, cpu, now, snext); balance_load(ops, cpu, now); @@ -2386,6 +2442,7 @@ csched2_schedule( } snext->start_time = now; + snext->tickled_cpu = -1; /* Safe because lock for old processor is held */ if ( snext->vcpu->processor != cpu ) diff --git a/xen/include/xen/perfc_defn.h b/xen/include/xen/perfc_defn.h index a336c71..4a835b8 100644 --- a/xen/include/xen/perfc_defn.h +++ b/xen/include/xen/perfc_defn.h @@ -66,6 +66,9 @@ PERFCOUNTER(runtime_max_timer, "csched2: runtime_max_timer") PERFCOUNTER(migrated, "csched2: migrated") PERFCOUNTER(migrate_resisted, "csched2: migrate_resisted") PERFCOUNTER(credit_reset, "csched2: credit_reset") +PERFCOUNTER(deferred_to_tickled_cpu,"csched2: deferred_to_tickled_cpu") +PERFCOUNTER(tickled_cpu_overwritten,"csched2: tickled_cpu_overwritten") +PERFCOUNTER(tickled_cpu_overridden, "csched2: tickled_cpu_overridden") PERFCOUNTER(need_flush_tlb_flush, "PG_need_flush tlb flushes") _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx https://lists.xen.org/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.