|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Xen-devel] [PATCH v2] sched: credit2: respect per-vcpu hard affinity
From: "Justin T. Weaver" <jtweaver@xxxxxxxxxx>
Make credit2 respect per-vcpu hard affinity by making sure that vcpus only
run on the pcpu(s) they are allowed to run on, based on their hard affinity
cpu masks.
Signed-off-by: Justin T. Weaver <jtweaver@xxxxxxxxxx>
---
Changes in v2:
* Added dynamically allocated cpu masks to avoid putting them on the stack;
replaced temp masks from v1 throughout
* Added helper function for code suggested in v1 review and called it in two
locations in function choose_cpu
* Removed v1 change to comment in the beginning of choose_cpu
* Replaced two instances of cpumask_and/cpumask_empty with cpumask_intersects
* Removed v1 re-work of code in function migrate; only change in migrate in
v2 is the assignment of a valid pcpu from the destination run queue to
vc->processor
* In function csched2_vcpu_migrate: removed the v1 change that called
function migrate even if the current and destination run queues were the
same in order to get a runq_tickle call; added the assignment of new_cpu
to vc->processor to fix the real underlying issue, which was the vcpu not
getting a call to sched_move_irqs
* Removed the looping added in v1 in function balance_load; may be added back
later because it would help to have balance_load be more aware of hard
affinity, but adding it does not affect credit2's current inability to
respect hard affinity.
* Removed coding style fix in function balance_load
* Improved comment in function runq_candidate
---
xen/common/sched_credit2.c | 122 +++++++++++++++++++++++++++++++++++++++-----
1 file changed, 108 insertions(+), 14 deletions(-)
diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index cf53770..de8fb5a 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -194,6 +194,12 @@ int opt_overload_balance_tolerance=-3;
integer_param("credit2_balance_over", opt_overload_balance_tolerance);
/*
+ * Use this to avoid having too many cpumask_t structs on the stack
+ */
+static cpumask_t **cpumask = NULL;
+#define csched2_cpumask cpumask[smp_processor_id()]
+
+/*
* Per-runqueue data
*/
struct csched2_runqueue_data {
@@ -268,6 +274,23 @@ struct csched2_dom {
uint16_t nr_vcpus;
};
+/*
+ * When a hard affinity change occurs, we may not be able to check some or
+ * all of the other run queues for a valid new processor for the given vcpu.
+ * Return svc's current pcpu if valid, otherwise return a safe pcpu.
+ */
+static int get_safe_pcpu(struct csched2_vcpu *svc)
+{
+ cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
&svc->rqd->active);
+ if ( unlikely(cpumask_empty(csched2_cpumask)) )
+ cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
+ cpupool_online_cpumask(svc->vcpu->domain->cpupool));
+
+ if ( cpumask_test_cpu(svc->vcpu->processor, csched2_cpumask) )
+ return svc->vcpu->processor;
+ else
+ return cpumask_any(csched2_cpumask);
+}
/*
* Time-to-credit, credit-to-time.
@@ -501,8 +524,9 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu,
struct csched2_vcpu *
goto tickle;
}
- /* Get a mask of idle, but not tickled */
+ /* Get a mask of idle, but not tickled, that new is allowed to run on. */
cpumask_andnot(&mask, &rqd->idle, &rqd->tickled);
+ cpumask_and(&mask, &mask, new->vcpu->cpu_hard_affinity);
/* If it's not empty, choose one */
i = cpumask_cycle(cpu, &mask);
@@ -513,9 +537,11 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu,
struct csched2_vcpu *
}
/* Otherwise, look for the non-idle cpu with the lowest credit,
- * skipping cpus which have been tickled but not scheduled yet */
+ * skipping cpus which have been tickled but not scheduled yet,
+ * that new is allowed to run on. */
cpumask_andnot(&mask, &rqd->active, &rqd->idle);
cpumask_andnot(&mask, &mask, &rqd->tickled);
+ cpumask_and(&mask, &mask, new->vcpu->cpu_hard_affinity);
for_each_cpu(i, &mask)
{
@@ -1063,9 +1089,8 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
d2printk("%pv -\n", svc->vcpu);
clear_bit(__CSFLAG_runq_migrate_request, &svc->flags);
}
- /* Leave it where it is for now. When we actually pay attention
- * to affinity we'll have to figure something out... */
- return vc->processor;
+
+ return get_safe_pcpu(svc);
}
/* First check to see if we're here because someone else suggested a place
@@ -1081,13 +1106,17 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
else
{
d2printk("%pv +\n", svc->vcpu);
- new_cpu = cpumask_cycle(vc->processor, &svc->migrate_rqd->active);
- goto out_up;
+ cpumask_and(csched2_cpumask, vc->cpu_hard_affinity,
+ &svc->migrate_rqd->active);
+ if ( !cpumask_empty(csched2_cpumask) )
+ {
+ new_cpu = cpumask_any(csched2_cpumask);
+ goto out_up;
+ }
+ /* Fall-through to normal cpu pick */
}
}
- /* FIXME: Pay attention to cpu affinity */
-
min_avgload = MAX_LOAD;
/* Find the runqueue with the lowest instantaneous load */
@@ -1099,17 +1128,24 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
rqd = prv->rqd + i;
/* If checking a different runqueue, grab the lock,
- * read the avg, and then release the lock.
+ * check hard affinity, read the avg, and then release the lock.
*
* If on our own runqueue, don't grab or release the lock;
* but subtract our own load from the runqueue load to simulate
* impartiality */
if ( rqd == svc->rqd )
{
+ if ( !cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) )
+ continue;
rqd_avgload = rqd->b_avgload - svc->avgload;
}
else if ( spin_trylock(&rqd->lock) )
{
+ if ( !cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) )
+ {
+ spin_unlock(&rqd->lock);
+ continue;
+ }
rqd_avgload = rqd->b_avgload;
spin_unlock(&rqd->lock);
}
@@ -1123,12 +1159,16 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
}
}
- /* We didn't find anyone (most likely because of spinlock contention);
leave it where it is */
if ( min_rqi == -1 )
- new_cpu = vc->processor;
+ {
+ /* No runqs found (most likely because of spinlock contention). */
+ new_cpu = get_safe_pcpu(svc);
+ }
else
{
- new_cpu = cpumask_cycle(vc->processor, &prv->rqd[min_rqi].active);
+ cpumask_and(csched2_cpumask, vc->cpu_hard_affinity,
+ &prv->rqd[min_rqi].active);
+ new_cpu = cpumask_any(csched2_cpumask);
BUG_ON(new_cpu >= nr_cpu_ids);
}
@@ -1207,7 +1247,12 @@ static void migrate(const struct scheduler *ops,
on_runq=1;
}
__runq_deassign(svc);
- svc->vcpu->processor = cpumask_any(&trqd->active);
+
+ cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
+ &trqd->active);
+ svc->vcpu->processor = cpumask_any(csched2_cpumask);
+ BUG_ON(svc->vcpu->processor >= nr_cpu_ids);
+
__runq_assign(svc, trqd);
if ( on_runq )
{
@@ -1330,6 +1375,12 @@ retry:
if ( test_bit(__CSFLAG_runq_migrate_request, &push_svc->flags) )
continue;
+ /* Skip if it can't run on the destination runq. */
+ cpumask_and(csched2_cpumask, push_svc->vcpu->cpu_hard_affinity,
+ &st.orqd->active);
+ if ( cpumask_empty(csched2_cpumask) )
+ continue;
+
list_for_each( pull_iter, &st.orqd->svc )
{
struct csched2_vcpu * pull_svc = list_entry(pull_iter, struct
csched2_vcpu, rqd_elem);
@@ -1343,6 +1394,12 @@ retry:
if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
continue;
+ /* Skip if it can't run on the destination runq. */
+ cpumask_and(csched2_cpumask, pull_svc->vcpu->cpu_hard_affinity,
+ &st.lrqd->active);
+ if ( cpumask_empty(csched2_cpumask) )
+ continue;
+
consider(&st, push_svc, pull_svc);
}
@@ -1360,6 +1417,12 @@ retry:
if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
continue;
+ /* Skip if it can't run on the destination runq. */
+ cpumask_and(csched2_cpumask, pull_svc->vcpu->cpu_hard_affinity,
+ &st.lrqd->active);
+ if ( cpumask_empty(csched2_cpumask) )
+ continue;
+
/* Consider pull only */
consider(&st, NULL, pull_svc);
}
@@ -1396,6 +1459,15 @@ csched2_vcpu_migrate(
/* Check if new_cpu is valid */
BUG_ON(!cpumask_test_cpu(new_cpu, &CSCHED2_PRIV(ops)->initialized));
+ BUG_ON(!cpumask_test_cpu(new_cpu, vc->cpu_hard_affinity));
+
+ /*
+ * Assign new_cpu to vc->processor here to get a call to sched_move_irqs
+ * in schedule.c in case there was a hard affinity change within the same
+ * run queue. vc will not be able to run in certain situations without
+ * this call.
+ */
+ vc->processor = new_cpu;
trqd = RQD(ops, new_cpu);
@@ -1610,6 +1682,10 @@ runq_candidate(struct csched2_runqueue_data *rqd,
{
struct csched2_vcpu * svc = list_entry(iter, struct csched2_vcpu,
runq_elem);
+ /* Only consider vcpus that are allowed to run on this processor. */
+ if ( !cpumask_test_cpu(cpu, svc->vcpu->cpu_hard_affinity) )
+ continue;
+
/* If this is on a different processor, don't pull it unless
* its credit is at least CSCHED2_MIGRATE_RESIST higher. */
if ( svc->vcpu->processor != cpu
@@ -1992,6 +2068,13 @@ csched2_alloc_pdata(const struct scheduler *ops, int cpu)
printk("%s: cpu %d not online yet, deferring initializatgion\n",
__func__, cpu);
+ /*
+ * For each new pcpu, allocate a cpumask_t for use throughout the
+ * scheduler to avoid putting any cpumask_t structs on the stack.
+ */
+ if ( !zalloc_cpumask_var(&cpumask[cpu]) )
+ return NULL;
+
return (void *)1;
}
@@ -2040,6 +2123,8 @@ csched2_free_pdata(const struct scheduler *ops, void
*pcpu, int cpu)
spin_unlock_irqrestore(&prv->lock, flags);
+ free_cpumask_var(cpumask[cpu]);
+
return;
}
@@ -2127,16 +2212,25 @@ csched2_init(struct scheduler *ops)
prv->load_window_shift = opt_load_window_shift;
+ cpumask = xzalloc_bytes(nr_cpu_ids * sizeof(cpumask_t *));
+ if ( cpumask == NULL )
+ return -ENOMEM;
+
return 0;
}
static void
csched2_deinit(const struct scheduler *ops)
{
+ int i;
struct csched2_private *prv;
prv = CSCHED2_PRIV(ops);
xfree(prv);
+
+ for ( i = 0; i < nr_cpu_ids; i++ )
+ free_cpumask_var(cpumask[i]);
+ xfree(cpumask);
}
--
1.7.10.4
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |