[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH RESEND 05/12] xen: numa-sched: make space for per-vcpu node-affinity
Before this change, each vcpu had its own vcpu-affinity (also called pinning), while the whole domain had a NUMA node-affinity. Of course, as the (credit) scheduler schedules vcpus and not whole domains, this meant that all the vcpus of a domain had the same NUMA node-affinity. This change is the first step toward overcoming this limitation. It adds the data structures for storing the node-affinity on a per-vcpu basis (along with allocating and initializing it). As far as this change alone is concerned, there is no specific way to change the node-affinity of a vcpu to something which is not automatically computed (based on its vcpu-affinity). Such logic is introduced in subsequent commits. Also, now that each vcpu has its own node-affinity, and in case the domain's node-affinity is set to 'automatically computed', we build it up as the union of all the node-affinities of all the vcpus of the domain. Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx> --- xen/common/domain.c | 39 ++++++++++++++++++++++++++++++++----- xen/common/keyhandler.c | 6 +++++- xen/common/schedule.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++ xen/include/xen/sched.h | 10 +++++++++ 4 files changed, 99 insertions(+), 6 deletions(-) diff --git a/xen/common/domain.c b/xen/common/domain.c index af31ab4..8d2ff49 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -128,6 +128,7 @@ struct vcpu *alloc_vcpu( if ( !zalloc_cpumask_var(&v->cpu_affinity) || !zalloc_cpumask_var(&v->cpu_affinity_tmp) || !zalloc_cpumask_var(&v->cpu_affinity_saved) || + !zalloc_cpumask_var(&v->node_affinity) || !zalloc_cpumask_var(&v->vcpu_dirty_cpumask) ) goto fail_free; @@ -159,6 +160,7 @@ struct vcpu *alloc_vcpu( free_cpumask_var(v->cpu_affinity); free_cpumask_var(v->cpu_affinity_tmp); free_cpumask_var(v->cpu_affinity_saved); + free_cpumask_var(v->node_affinity); free_cpumask_var(v->vcpu_dirty_cpumask); free_vcpu_struct(v); return NULL; @@ -353,7 +355,7 @@ void 
domain_update_node_affinity(struct domain *d) cpumask_var_t online_affinity; const cpumask_t *online; struct vcpu *v; - unsigned int node; + unsigned int cpu; if ( !zalloc_cpumask_var(&cpumask) ) return; @@ -367,9 +369,36 @@ void domain_update_node_affinity(struct domain *d) spin_lock(&d->node_affinity_lock); + /* + * Let's prepare the cpumask that will be used below to actually update + * the node-affinity of the whole domain. Each vcpu has a vcpu-affinity + * and a numa-affinity. What gets built in cpumask (and used below) is + * the union of all the (online) cpus in all the vcpu's numa-affinity + * masks. + * + * In turn, the numa-affinity mask of the i-th vcpu (say, 'v') is + * either derived directly from the vcpu's vcpu-affinity mask (in case + * v->auto_node_affinity is true) or has its own value, (potentially) + * completely independent from v->cpu_affinity. In the former case, it + * is here that we make sure the two affinity masks match (since this + * function gets called in correspondence of each modification to + * v->cpu_affinity happening in vcpu_set_affinity() ); in the latter + * case, we just leave v->node_affinity alone. 
+ */ for_each_vcpu ( d, v ) { - cpumask_and(online_affinity, v->cpu_affinity, online); + if ( v->auto_node_affinity ) + { + cpumask_clear(v->node_affinity); + for_each_cpu ( cpu, v->cpu_affinity ) + cpumask_or(v->node_affinity, v->node_affinity, + &node_to_cpumask(cpu_to_node(cpu))); + + cpumask_and(online_affinity, v->node_affinity, online); + } + else + cpumask_copy(online_affinity, v->node_affinity); + cpumask_or(cpumask, cpumask, online_affinity); } @@ -383,9 +412,8 @@ void domain_update_node_affinity(struct domain *d) if ( d->auto_node_affinity ) { nodes_clear(d->node_affinity); - for_each_online_node ( node ) - if ( cpumask_intersects(&node_to_cpumask(node), cpumask) ) - node_set(node, d->node_affinity); + for_each_cpu ( cpu, cpumask ) + node_set(cpu_to_node(cpu), d->node_affinity); } sched_set_node_affinity(d, &d->node_affinity); @@ -734,6 +762,7 @@ static void complete_domain_destroy(struct rcu_head *head) { free_cpumask_var(v->cpu_affinity); free_cpumask_var(v->cpu_affinity_tmp); + free_cpumask_var(v->node_affinity); free_cpumask_var(v->vcpu_dirty_cpumask); free_vcpu_struct(v); } diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c index 8e4b3f8..8d5e8b2 100644 --- a/xen/common/keyhandler.c +++ b/xen/common/keyhandler.c @@ -297,7 +297,11 @@ static void dump_domains(unsigned char key) cpuset_print(tmpstr, sizeof(tmpstr), v->vcpu_dirty_cpumask); printk("dirty_cpus=%s ", tmpstr); cpuset_print(tmpstr, sizeof(tmpstr), v->cpu_affinity); - printk("cpu_affinity=%s\n", tmpstr); + printk("cpu_affinity=%s ", tmpstr); + cpuset_print(tmpstr, sizeof(tmpstr), v->node_affinity); + printk("node_affinity=%s%s\n", + v->auto_node_affinity ? 
"(auto)" : "(manual)", + tmpstr); printk(" pause_count=%d pause_flags=%lx\n", atomic_read(&v->pause_count), v->pause_flags); arch_dump_vcpu_info(v); diff --git a/xen/common/schedule.c b/xen/common/schedule.c index 0f45f07..b3966ad 100644 --- a/xen/common/schedule.c +++ b/xen/common/schedule.c @@ -198,6 +198,9 @@ int sched_init_vcpu(struct vcpu *v, unsigned int processor) else cpumask_setall(v->cpu_affinity); + v->auto_node_affinity = 1; + cpumask_copy(v->node_affinity, v->cpu_affinity); + /* Initialise the per-vcpu timers. */ init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, v, v->processor); @@ -684,6 +687,53 @@ int vcpu_set_affinity(struct vcpu *v, const cpumask_t *affinity) return 0; } +int vcpu_set_node_affinity(struct vcpu *v, const nodemask_t *nodes) +{ + nodemask_t online_nodes; + int node; + + nodes_and(online_nodes, node_online_map, *nodes); + + /* Having no affinity at all is just wrong */ + if ( nodes_empty(online_nodes) ) + return -EINVAL; + + spin_lock(&v->domain->node_affinity_lock); + + /* + * Explicitly saying "all nodes" is not particularly useful here. + * Let's use it as the `reset numa-affinity to auto' command. + */ + if ( nodes_full(*nodes) ) + { + v->auto_node_affinity = 1; + goto out; + } + + /* + * When someone asks for a specific numa-affinity for a vcpu we need to + * clear auto_node_affinity, convert the nodemask in online_nodes + * into a cpumask_t and store it in node_affinity. + */ + v->auto_node_affinity = 0; + + cpumask_clear(v->node_affinity); + for_each_node_mask( node, online_nodes ) + cpumask_or(v->node_affinity, v->node_affinity, + &node_to_cpumask(node)); + +out: + spin_unlock(&v->domain->node_affinity_lock); + + /* + * Changing the numa-affinity of a vcpu calls for an update + * of the node-affinity of the whole domain. + */ + domain_update_node_affinity(v->domain); + + return 0; +} + /* Block the currently-executing domain until a pertinent event occurs. 
*/ void vcpu_block(void) { diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h index 25bf637..732d6b6 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -172,6 +172,8 @@ struct vcpu /* VCPU need affinity restored */ bool_t affinity_broken; + /* is node_affinity (below) automatically computed from vcpu-affinity? */ + bool_t auto_node_affinity; /* * > 0: a single port is being polled; @@ -197,6 +199,13 @@ struct vcpu /* Used to restore affinity across S3. */ cpumask_var_t cpu_affinity_saved; + /* + * Bitmask of CPUs on which this VCPU prefers to run. For both this + * and auto_node_affinity access is serialized against + * v->domain->node_affinity_lock. + */ + cpumask_var_t node_affinity; + /* Bitmask of CPUs which are holding onto this VCPU's state. */ cpumask_var_t vcpu_dirty_cpumask; @@ -740,6 +749,7 @@ int schedule_cpu_switch(unsigned int cpu, struct cpupool *c); void vcpu_force_reschedule(struct vcpu *v); int cpu_disable_scheduler(unsigned int cpu); int vcpu_set_affinity(struct vcpu *v, const cpumask_t *affinity); +int vcpu_set_node_affinity(struct vcpu *v, const nodemask_t *nodes); void restore_vcpu_affinity(struct domain *d); void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate); _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx http://lists.xen.org/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |