[xen staging] xen: credit2: limit the max number of CPUs in a runqueue
commit 8e2aa76dc1670e82eaa15683353853bc66bf54fc
Author: Dario Faggioli <dfaggioli@xxxxxxxx>
AuthorDate: Thu May 28 23:29:44 2020 +0200
Commit: George Dunlap <george.dunlap@xxxxxxxxxx>
CommitDate: Fri May 29 18:53:54 2020 +0100
xen: credit2: limit the max number of CPUs in a runqueue
In Credit2 CPUs (can) share runqueues, depending on the topology. For
instance, with per-socket runqueues (the default) all the CPUs that are
part of the same socket share a runqueue.
On platforms with a huge number of CPUs per socket, that could be a
problem. An example is AMD EPYC2 servers, where we can have up to 128
CPUs in a socket.
It is of course possible to define other, still topology-based, runqueue
arrangements (e.g., per-LLC, per-DIE, etc). But that may still result in
runqueues with too many CPUs on other/future platforms. For instance, a
system with 96 CPUs and 2 NUMA nodes will end up having 48 CPUs per
runqueue. Not as bad, but still a lot!
Therefore, let's set a limit to the max number of CPUs that can share a
Credit2 runqueue. The actual value is configurable (at boot time), the
default being 16. If, for instance, there are more than 16 CPUs in a
socket, they'll be split among two (or more) runqueues.
Note: with core scheduling enabled, this parameter sets the max number
of *scheduling resources* that can share a runqueue. Therefore, with
granularity set to core (and assuming 2 threads per core), we will have
at most 16 cores per runqueue, which corresponds to 32 threads. But that
is fine, considering how core scheduling works.
Signed-off-by: Dario Faggioli <dfaggioli@xxxxxxxx>
Reviewed-by: Juergen Gross <jgross@xxxxxxxx>
---
docs/misc/xen-command-line.pandoc | 14 ++++
xen/common/sched/credit2.c | 144 ++++++++++++++++++++++++++++++++++++--
xen/include/asm-arm/cpufeature.h | 5 ++
xen/include/asm-x86/processor.h | 5 ++
4 files changed, 162 insertions(+), 6 deletions(-)
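[Editorial illustration, not part of the patch: a minimal standalone C sketch of the balancing arithmetic that cpu_add_to_runqueue() gains in the credit2.c hunk below. The helper name runq_split() and the sample CPU counts are hypothetical; the two formulas mirror the min_rqs / max_cpus_runq computation in the patch.]

    #include <stdio.h>

    /* Hypothetical helper mirroring the patch's balancing arithmetic. */
    static void runq_split(unsigned int online_cpus, unsigned int opt_max)
    {
        /* Smallest number of runqueues keeping each one within the cap. */
        unsigned int min_rqs = ((online_cpus - 1) / opt_max) + 1;
        /* Even per-runqueue target: can be below the cap, never above. */
        unsigned int max_cpus_runq = online_cpus / min_rqs;

        printf("%u CPUs, cap %u -> %u runqueues, at most %u CPUs each\n",
               online_cpus, opt_max, min_rqs, max_cpus_runq);
    }

    int main(void)
    {
        runq_split(128, 16); /* EPYC2 socket: 8 runqueues of 16 CPUs     */
        runq_split(96, 16);  /* submultiple cap: 6 even runqueues of 16  */
        runq_split(96, 20);  /* non-submultiple cap: uneven 5-way split  */
        return 0;
    }

[The last call illustrates the recommendation in the documentation hunk below: with a cap that is not a submultiple of 96, the split cannot come out even.]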
diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
index e16bb90184..1787f2c8fb 100644
--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -1840,6 +1840,20 @@ with read and write permissions.
Choose the default scheduler.
+### sched_credit2_max_cpus_runqueue
+> `= <integer>`
+
+> Default: `16`
+
+Defines how many CPUs will be put, at most, in each Credit2 runqueue.
+
+Runqueues are still arranged according to the host topology (and following
+what is indicated by the 'credit2_runqueue' parameter). But a cap is also
+applied to the number of CPUs that share each runqueue.
+
+A value that is a submultiple of the number of online CPUs is recommended,
+as that would likely produce a perfectly balanced runqueue configuration.
+
### sched_credit2_migrate_resist
> `= <integer>`
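[A hypothetical usage example, values assumed and not from the patch: on a host where each socket holds 64 CPUs, one might cap Credit2 runqueues at 8 CPUs from the hypervisor command line, e.g. in a GRUB menu entry:]

    multiboot2 /boot/xen.gz sched=credit2 sched_credit2_max_cpus_runqueue=8

[Here 8 is a submultiple of the online CPU count, matching the recommendation above.]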
diff --git a/xen/common/sched/credit2.c b/xen/common/sched/credit2.c
index 8a4f28b9f5..f4d3f8ae6b 100644
--- a/xen/common/sched/credit2.c
+++ b/xen/common/sched/credit2.c
@@ -25,6 +25,7 @@
#include <xen/trace.h>
#include <xen/cpu.h>
#include <xen/keyhandler.h>
+#include <asm/processor.h>
#include "private.h"
@@ -471,6 +472,22 @@ static int __init parse_credit2_runqueue(const char *s)
}
custom_param("credit2_runqueue", parse_credit2_runqueue);
+/*
+ * How many CPUs will be put, at most, in each runqueue.
+ *
+ * Runqueues are still arranged according to the host topology (and according
+ * to the value of the 'credit2_runqueue' parameter). But we also have a cap
+ * on the number of CPUs that share each runqueue.
+ *
+ * This should be considered an upper limit. In fact, we also try to balance
+ * the number of CPUs in each runqueue. And, when doing that, it is possible
+ * that fewer CPUs than this parameter mandates will actually be put
+ * in each runqueue.
+ */
+#define MAX_CPUS_RUNQ 16
+static unsigned int __read_mostly opt_max_cpus_runqueue = MAX_CPUS_RUNQ;
+integer_param("sched_credit2_max_cpus_runqueue", opt_max_cpus_runqueue);
+
/*
* Per-runqueue data
*/
@@ -852,18 +869,83 @@ cpu_runqueue_match(const struct csched2_runqueue_data *rqd, unsigned int cpu)
(opt_runqueue == OPT_RUNQUEUE_NODE && same_node(peer_cpu, cpu));
}
+/*
+ * Additional checks, to avoid separating siblings in different runqueues.
+ * This deals with both Intel's HTs and AMD's CUs. An arch that does not have
+ * any similar concept will just have cpu_nr_siblings() always return 1, and
+ * set up the cpu_sibling_mask-s accordingly (as ARM currently does), and
+ * things will just work.
+ */
+static bool
+cpu_runqueue_siblings_match(const struct csched2_runqueue_data *rqd,
+ unsigned int cpu, unsigned int max_cpus_runq)
+{
+ unsigned int nr_sibls = cpu_nr_siblings(cpu);
+ unsigned int rcpu, tot_sibls = 0;
+
+ /*
+ * If we put the CPU in this runqueue, we must be sure that there will
+ * be enough room for accepting its sibling(s) as well.
+ */
+ cpumask_clear(cpumask_scratch_cpu(cpu));
+ for_each_cpu ( rcpu, &rqd->active )
+ {
+ ASSERT(rcpu != cpu);
+ if ( !cpumask_intersects(per_cpu(cpu_sibling_mask, rcpu),
+ cpumask_scratch_cpu(cpu)) )
+ {
+ /*
+ * For each CPU already in the runqueue, account for it and for
+ * its sibling(s), independently of whether they are in the
+ * runqueue or not. Of course, we do this only once, for each CPU
+ * that is already inside the runqueue and all its siblings!
+ *
+ * This way, even if there are CPUs in the runqueue with siblings
+ * in different cpupools, we still count all of them here.
+ * The reason for this is that, if at some future point we
+ * move those sibling CPUs to this cpupool, we want them to land
+ * in this runqueue. Hence we must be sure to leave space for them.
+ */
+ cpumask_or(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
+ per_cpu(cpu_sibling_mask, rcpu));
+ tot_sibls += cpu_nr_siblings(rcpu);
+ }
+ }
+ /*
+ * We know that neither the CPU nor any of its siblings are here,
+ * or we wouldn't even have entered the function.
+ */
+ ASSERT(!cpumask_intersects(cpumask_scratch_cpu(cpu),
+ per_cpu(cpu_sibling_mask, cpu)));
+
+ /* Try adding CPU and its sibling(s) to the count and check... */
+ return tot_sibls + nr_sibls <= max_cpus_runq;
+}
+
static struct csched2_runqueue_data *
-cpu_add_to_runqueue(struct csched2_private *prv, unsigned int cpu)
+cpu_add_to_runqueue(const struct scheduler *ops, unsigned int cpu)
{
+ struct csched2_private *prv = csched2_priv(ops);
struct csched2_runqueue_data *rqd, *rqd_new;
+ struct csched2_runqueue_data *rqd_valid = NULL;
struct list_head *rqd_ins;
unsigned long flags;
int rqi = 0;
- bool rqi_unused = false, rqd_valid = false;
+ unsigned int min_rqs, max_cpus_runq;
+ bool rqi_unused = false;
/* Prealloc in case we need it - not allowed with interrupts off. */
rqd_new = xzalloc(struct csched2_runqueue_data);
+ /*
+ * While respecting the limit of not having more than the max number of
+ * CPUs per runqueue, let's also try to "spread" the CPUs, as evenly as
+ * possible, among the runqueues. For doing that, we need to know upfront
+ * how many CPUs we have, so let's use the number of CPUs that are online
+ * for that.
+ */
+ min_rqs = ((num_online_cpus() - 1) / opt_max_cpus_runqueue) + 1;
+ max_cpus_runq = num_online_cpus() / min_rqs;
+
write_lock_irqsave(&prv->lock, flags);
rqd_ins = &prv->rql;
@@ -873,10 +955,59 @@ cpu_add_to_runqueue(struct csched2_private *prv, unsigned int cpu)
if ( !rqi_unused && rqd->id > rqi )
rqi_unused = true;
+ /*
+ * First of all, let's check whether, according to the system
+ * topology, this CPU belongs in this runqueue.
+ */
if ( cpu_runqueue_match(rqd, cpu) )
{
- rqd_valid = true;
- break;
+ /*
+ * If the CPU has any siblings, and they are online and being
+ * added to this cpupool, always keep them together. Even
+ * if that means violating what the opt_max_cpus_runqueue param
+ * indicates. However, if this happens, chances are high that a
+ * too small value was used for the parameter, so warn the user
+ * about that.
+ *
+ * Note that we cannot check this once and for all, say, during
+ * scheduler initialization. In fact, at least in theory, the
+ * number of siblings a CPU has may not be the same for all the
+ * CPUs.
+ */
+ if ( cpumask_intersects(&rqd->active, per_cpu(cpu_sibling_mask, cpu)) )
+ {
+ if ( cpumask_weight(&rqd->active) >= opt_max_cpus_runqueue )
+ {
+ printk("WARNING: %s: more than opt_max_cpus_runqueue "
+ "in a runqueue (%u vs %u), due to topology
constraints.\n"
+ "Consider raising it!\n",
+ __func__, opt_max_cpus_runqueue,
+ cpumask_weight(&rqd->active));
+ }
+ rqd_valid = rqd;
+ break;
+ }
+
+ /*
+ * If we're using core (or socket) scheduling, no further checking
+ * is needed beyond whether the number of CPUs already in this
+ * runqueue respects our upper bound.
+ *
+ * Otherwise, let's try to make sure that siblings stay in the
+ * same runqueue, pretty much under any circumstances.
+ */
+ if ( rqd->refcnt < max_cpus_runq &&
+ (ops->cpupool->gran != SCHED_GRAN_cpu ||
+ cpu_runqueue_siblings_match(rqd, cpu, max_cpus_runq)) )
+ {
+ /*
+ * This runqueue is ok, but as we said, we also want an even
+ * distribution of the CPUs. So, unless this is the very first
+ * match, we go on, check all runqueues and actually add the
+ * CPU into the one that is least full.
+ */
+ if ( !rqd_valid || rqd->refcnt < rqd_valid->refcnt )
+ rqd_valid = rqd;
+ }
}
if ( !rqi_unused )
@@ -900,6 +1031,8 @@ cpu_add_to_runqueue(struct csched2_private *prv, unsigned int cpu)
rqd->pick_bias = cpu;
rqd->id = rqi;
}
+ else
+ rqd = rqd_valid;
rqd->refcnt++;
@@ -3744,7 +3877,6 @@ csched2_dump(const struct scheduler *ops)
static void *
csched2_alloc_pdata(const struct scheduler *ops, int cpu)
{
- struct csched2_private *prv = csched2_priv(ops);
struct csched2_pcpu *spc;
struct csched2_runqueue_data *rqd;
@@ -3754,7 +3886,7 @@ csched2_alloc_pdata(const struct scheduler *ops, int cpu)
if ( spc == NULL )
return ERR_PTR(-ENOMEM);
- rqd = cpu_add_to_runqueue(prv, cpu);
+ rqd = cpu_add_to_runqueue(ops, cpu);
if ( IS_ERR(rqd) )
{
xfree(spc);
diff --git a/xen/include/asm-arm/cpufeature.h b/xen/include/asm-arm/cpufeature.h
index 9af5666628..8fdf9685d7 100644
--- a/xen/include/asm-arm/cpufeature.h
+++ b/xen/include/asm-arm/cpufeature.h
@@ -64,6 +64,11 @@ static inline bool cpus_have_cap(unsigned int num)
return test_bit(num, cpu_hwcaps);
}
+static inline int cpu_nr_siblings(unsigned int cpu)
+{
+ return 1;
+}
+
/* System capability check for constant cap */
#define cpus_have_const_cap(num) ({ \
register_t __ret; \
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 070691882b..73017c3f4b 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -174,6 +174,11 @@ extern void init_intel_cacheinfo(struct cpuinfo_x86 *c);
unsigned int apicid_to_socket(unsigned int);
+static inline int cpu_nr_siblings(unsigned int cpu)
+{
+ return cpu_data[cpu].x86_num_siblings;
+}
+
/*
* Generic CPUID function
* clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
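[Editorial illustration, not part of the patch: a standalone C sketch of the accounting that cpu_runqueue_siblings_match() performs. The arrays sibling_of[] and active[], the function name siblings_fit(), and the 2-threads-per-core topology are hypothetical stand-ins for Xen's cpumasks; the point shown is that a whole sibling group is counted once, even when some of its members are not (yet) in the runqueue.]

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_CPUS 8

    /* Hypothetical topology: CPUs (0,1), (2,3), (4,5), (6,7) are siblings. */
    static const unsigned int sibling_of[NR_CPUS] = { 1, 0, 3, 2, 5, 4, 7, 6 };

    static bool siblings_fit(const bool active[NR_CPUS], unsigned int cpu,
                             unsigned int max_cpus_runq)
    {
        bool counted[NR_CPUS] = { false };
        unsigned int rcpu, tot_sibls = 0;

        /* Mirrors the ASSERT in the patch: neither the CPU nor its
         * sibling may already be in the runqueue when we get here. */
        if ( active[cpu] || active[sibling_of[cpu]] )
            return false;

        for ( rcpu = 0; rcpu < NR_CPUS; rcpu++ )
        {
            if ( !active[rcpu] || counted[rcpu] )
                continue;
            /* Count the whole sibling group once, present in the
             * runqueue or not, so room is left for absent siblings. */
            counted[rcpu] = counted[sibling_of[rcpu]] = true;
            tot_sibls += 2; /* 2 threads per core in this toy topology */
        }
        /* Would the new CPU and its sibling still fit under the cap? */
        return tot_sibls + 2 <= max_cpus_runq;
    }

    int main(void)
    {
        /* Runqueue currently holds CPUs 0, 1 and 2; CPU 3 is absent,
         * but its slot is reserved because its sibling 2 is in. */
        bool active[NR_CPUS] = { true, true, true, false };

        printf("CPU 4 fits under cap 6? %s\n",
               siblings_fit(active, 4, 6) ? "yes" : "no"); /* yes: 4+2 <= 6 */
        printf("CPU 4 fits under cap 5? %s\n",
               siblings_fit(active, 4, 5) ? "yes" : "no"); /* no: 4+2 > 5 */
        return 0;
    }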
--
generated by git-patchbot for /home/xen/git/xen.git#staging