[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v3 1/4] sched: credit2: respect per-vcpu hard affinity



by making sure that vcpus only run on the pcpu(s) they are allowed to
run on based on their hard affinity cpu masks.

Signed-off-by: Justin T. Weaver <jtweaver@xxxxxxxxxx>
---
Changes in v3:
(all v3 changes are based on v2 review comments)
 * Renamed cpumask to scratch_mask
 * Renamed function get_safe_pcpu to get_fallback_cpu
 * Improved comment for function get_fallback_cpu
 * Replaced cpupool_online_cpumask with VCPU2ONLINE in function
   get_fallback_cpu to shorten the line
 * Added #define for VCPU2ONLINE (probably should be factored out of
   schedule.c and here, and put into a common header)
 * Modified code in function get_fallback_cpu: moved check for current
   processor to the top; added an ASSERT because the mask should not be empty
 * Modified code and comment in function choose_cpu in migrate request section
 * Added comment to function choose_cpu explaining why the vcpu passed to the
   function might not have hard affinity with any of the pcpus in its assigned
   run queue
 * Modified code in function choose_cpu to make it more readable
 * Moved/changed "We didn't find ..." comment in function choose_cpu
 * Combined migration flag check and hard affinity check into valid migration
   check helper function; replaced code in three places in function
   balance_load with call to the helper function
 * Changed a BUG_ON to an ASSERT in function csched2_vcpu_migrate
 * Moved vc->processor assignment in function csched2_vcpu_migrate to an else
   block to execute only if current and destination run queues are the same;
   Note: without the processor assignment here the vcpu might be assigned to a
   processor it is no longer allowed to run on. In that case, function
   runq_candidate may only get called for the vcpu's old processor, and
   runq_candidate will no longer let a vcpu run on a processor that it's not
   allowed to run on (because of the hard affinity check first introduced in
   v1 of this patch).
 * csched2_init: changed xzalloc_bytes to xmalloc_array for allocation of
   scratch_mask
 * csched2_deinit: removed scratch_mask freeing loop; it was not needed
Changes in v2:
 * Added dynamically allocated cpu masks to avoid putting them on the stack;
   replaced temp masks from v1 throughout
 * Added helper function for code suggested in v1 review and called it in two
   locations in function choose_cpu
 * Removed v1 change to comment in the beginning of choose_cpu
 * Replaced two instances of cpumask_and/cpumask_empty with cpumask_intersects
 * Removed v1 re-work of code in function migrate; only change in migrate in
   v2 is the assignment of a valid pcpu from the destination run queue to
   vc->processor
 * In function csched2_vcpu_migrate: removed change from v1 that called
   function migrate even if cur and dest run queues were the same in order
   to get a runq_tickle call; added processor assignment to new_cpu to fix
   the real underlying issue which was the vcpu not getting a call to
   sched_move_irqs
 * Removed the looping added in v1 in function balance_load; may be added back
   later because it would help to have balance_load be more aware of hard
   affinity, but adding it does not affect credit2's current inability to
   respect hard affinity.
 * Removed coding style fix in function balance_load
 * Improved comment in function runq_candidate
---
 xen/common/sched_credit2.c |  139 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 113 insertions(+), 26 deletions(-)

diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index 7581731..af716e4 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -176,6 +176,7 @@ integer_param("sched_credit2_migrate_resist", 
opt_migrate_resist);
 #define c2r(_ops, _cpu)     (CSCHED2_PRIV(_ops)->runq_map[(_cpu)])
 /* CPU to runqueue struct macro */
 #define RQD(_ops, _cpu)     (&CSCHED2_PRIV(_ops)->rqd[c2r(_ops, _cpu)])
+#define VCPU2ONLINE(_v)     cpupool_online_cpumask((_v)->domain->cpupool)
 
 /*
  * Shifts for load average.
@@ -194,6 +195,12 @@ int opt_overload_balance_tolerance=-3;
 integer_param("credit2_balance_over", opt_overload_balance_tolerance);
 
 /*
+ * Use this to avoid having too many cpumask_t structs on the stack
+ */
+static cpumask_t **scratch_mask = NULL;
+#define csched2_cpumask scratch_mask[smp_processor_id()]
+
+/*
  * Per-runqueue data
  */
 struct csched2_runqueue_data {
@@ -268,6 +275,32 @@ struct csched2_dom {
     uint16_t nr_vcpus;
 };
 
+/*
+ * When a hard affinity change occurs, we may not be able to check some or
+ * all of the other run queues for a valid new processor for the given vcpu
+ * because (in function choose_cpu) either the trylock on the private data
+ * failed or the trylock on each run queue with valid processor(s) for svc
+ * failed. In these cases, this function is used to pick a pcpu that svc can
+ * run on. It returns svc's current pcpu if valid, or a different pcpu from
+ * the run queue svc is currently assigned to, or if none of those are valid,
+ * it returns a pcpu from the intersection of svc's hard affinity and the
+ * domain's online cpumask.
+ */
+static int get_fallback_cpu(struct csched2_vcpu *svc)
+{
+    if ( likely(cpumask_test_cpu(svc->vcpu->processor,
+        svc->vcpu->cpu_hard_affinity)) )
+        return svc->vcpu->processor;
+
+    cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
+        &svc->rqd->active);
+    if ( cpumask_empty(csched2_cpumask) )
+        cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
+            VCPU2ONLINE(svc->vcpu));
+    ASSERT( !cpumask_empty(csched2_cpumask) );
+
+    return cpumask_any(csched2_cpumask);
+}
 
 /*
  * Time-to-credit, credit-to-time.
@@ -501,8 +534,9 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu, 
struct csched2_vcpu *
         goto tickle;
     }
     
-    /* Get a mask of idle, but not tickled */
+    /* Get a mask of idle, but not tickled, that new is allowed to run on. */
     cpumask_andnot(&mask, &rqd->idle, &rqd->tickled);
+    cpumask_and(&mask, &mask, new->vcpu->cpu_hard_affinity);
     
     /* If it's not empty, choose one */
     i = cpumask_cycle(cpu, &mask);
@@ -513,9 +547,11 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu, 
struct csched2_vcpu *
     }
 
     /* Otherwise, look for the non-idle cpu with the lowest credit,
-     * skipping cpus which have been tickled but not scheduled yet */
+     * skipping cpus which have been tickled but not scheduled yet,
+     * that new is allowed to run on. */
     cpumask_andnot(&mask, &rqd->active, &rqd->idle);
     cpumask_andnot(&mask, &mask, &rqd->tickled);
+    cpumask_and(&mask, &mask, new->vcpu->cpu_hard_affinity);
 
     for_each_cpu(i, &mask)
     {
@@ -1078,9 +1114,8 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
             d2printk("%pv -\n", svc->vcpu);
             clear_bit(__CSFLAG_runq_migrate_request, &svc->flags);
         }
-        /* Leave it where it is for now.  When we actually pay attention
-         * to affinity we'll have to figure something out... */
-        return vc->processor;
+
+        return get_fallback_cpu(svc);
     }
 
     /* First check to see if we're here because someone else suggested a place
@@ -1091,41 +1126,53 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
         {
             printk("%s: Runqueue migrate aborted because target runqueue 
disappeared!\n",
                    __func__);
-            /* Fall-through to normal cpu pick */
         }
         else
         {
-            d2printk("%pv +\n", svc->vcpu);
-            new_cpu = cpumask_cycle(vc->processor, &svc->migrate_rqd->active);
-            goto out_up;
+            cpumask_and(csched2_cpumask, vc->cpu_hard_affinity,
+                &svc->migrate_rqd->active);
+            new_cpu = cpumask_any(csched2_cpumask);
+            if ( new_cpu < nr_cpu_ids )
+            {
+                d2printk("%pv +\n", svc->vcpu);
+                goto out_up;
+            }
         }
+        /* Fall-through to normal cpu pick */
     }
 
-    /* FIXME: Pay attention to cpu affinity */                                 
                                                     
-
     min_avgload = MAX_LOAD;
 
     /* Find the runqueue with the lowest instantaneous load */
     for_each_cpu(i, &prv->active_queues)
     {
         struct csched2_runqueue_data *rqd;
-        s_time_t rqd_avgload;
+        s_time_t rqd_avgload = MAX_LOAD;
 
         rqd = prv->rqd + i;
 
         /* If checking a different runqueue, grab the lock,
-         * read the avg, and then release the lock.
+         * check hard affinity, read the avg, and then release the lock.
          *
          * If on our own runqueue, don't grab or release the lock;
          * but subtract our own load from the runqueue load to simulate
-         * impartiality */
+         * impartiality.
+         *
+         * svc's hard affinity may have changed; this function is the
+         * credit 2 scheduler's first opportunity to react to the change,
+         * so it is possible here that svc does not have hard affinity
+         * with any of the pcpus of svc's currently assigned run queue.
+         */
         if ( rqd == svc->rqd )
         {
-            rqd_avgload = rqd->b_avgload - svc->avgload;
+            if ( cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) )
+                rqd_avgload = rqd->b_avgload - svc->avgload;
         }
         else if ( spin_trylock(&rqd->lock) )
         {
-            rqd_avgload = rqd->b_avgload;
+            if ( cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) )
+                rqd_avgload = rqd->b_avgload;
+
             spin_unlock(&rqd->lock);
         }
         else
@@ -1138,12 +1185,14 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
         }
     }
 
-    /* We didn't find anyone (most likely because of spinlock contention); 
leave it where it is */
+    /* We didn't find anyone (most likely because of spinlock contention). */
     if ( min_rqi == -1 )
-        new_cpu = vc->processor;
+        new_cpu = get_fallback_cpu(svc);
     else
     {
-        new_cpu = cpumask_cycle(vc->processor, &prv->rqd[min_rqi].active);
+        cpumask_and(csched2_cpumask, vc->cpu_hard_affinity,
+            &prv->rqd[min_rqi].active);
+        new_cpu = cpumask_any(csched2_cpumask);
         BUG_ON(new_cpu >= nr_cpu_ids);
     }
 
@@ -1223,7 +1272,12 @@ static void migrate(const struct scheduler *ops,
             on_runq=1;
         }
         __runq_deassign(svc);
-        svc->vcpu->processor = cpumask_any(&trqd->active);
+
+        cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
+            &trqd->active);
+        svc->vcpu->processor = cpumask_any(csched2_cpumask);
+        BUG_ON(svc->vcpu->processor >= nr_cpu_ids);
+
         __runq_assign(svc, trqd);
         if ( on_runq )
         {
@@ -1237,6 +1291,20 @@ static void migrate(const struct scheduler *ops,
     }
 }
 
+/*
+ * Migration of vcpu svc to run queue rqd is a valid option if svc is not
+ * already flagged to migrate and if svc is allowed to run on at least one of
+ * the pcpus assigned to rqd based on svc's hard affinity mask.
+ */
+static bool_t valid_vcpu_migration(struct csched2_vcpu *svc,
+                                   struct csched2_runqueue_data *rqd)
+{
+    if ( test_bit(__CSFLAG_runq_migrate_request, &svc->flags)
+        || !cpumask_intersects(svc->vcpu->cpu_hard_affinity, &rqd->active) )
+        return 0;
+    else
+        return 1;
+}
 
 static void balance_load(const struct scheduler *ops, int cpu, s_time_t now)
 {
@@ -1345,8 +1413,7 @@ retry:
 
         __update_svc_load(ops, push_svc, 0, now);
 
-        /* Skip this one if it's already been flagged to migrate */
-        if ( test_bit(__CSFLAG_runq_migrate_request, &push_svc->flags) )
+        if ( !valid_vcpu_migration(push_svc, st.orqd) )
             continue;
 
         list_for_each( pull_iter, &st.orqd->svc )
@@ -1358,8 +1425,7 @@ retry:
                 __update_svc_load(ops, pull_svc, 0, now);
             }
         
-            /* Skip this one if it's already been flagged to migrate */
-            if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
+            if ( !valid_vcpu_migration(pull_svc, st.lrqd) )
                 continue;
 
             consider(&st, push_svc, pull_svc);
@@ -1375,8 +1441,7 @@ retry:
     {
         struct csched2_vcpu * pull_svc = list_entry(pull_iter, struct 
csched2_vcpu, rqd_elem);
         
-        /* Skip this one if it's already been flagged to migrate */
-        if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
+        if ( !valid_vcpu_migration(pull_svc, st.lrqd) )
             continue;
 
         /* Consider pull only */
@@ -1415,11 +1480,14 @@ csched2_vcpu_migrate(
 
     /* Check if new_cpu is valid */
     BUG_ON(!cpumask_test_cpu(new_cpu, &CSCHED2_PRIV(ops)->initialized));
+    ASSERT(cpumask_test_cpu(new_cpu, vc->cpu_hard_affinity));
 
     trqd = RQD(ops, new_cpu);
 
     if ( trqd != svc->rqd )
         migrate(ops, svc, trqd, NOW());
+    else
+        vc->processor = new_cpu;
 }
 
 static int
@@ -1638,6 +1706,10 @@ runq_candidate(struct csched2_runqueue_data *rqd,
     {
         struct csched2_vcpu * svc = list_entry(iter, struct csched2_vcpu, 
runq_elem);
 
+        /* Only consider vcpus that are allowed to run on this processor. */
+        if ( !cpumask_test_cpu(cpu, svc->vcpu->cpu_hard_affinity) )
+            continue;
+
         /* If this is on a different processor, don't pull it unless
          * its credit is at least CSCHED2_MIGRATE_RESIST higher. */
         if ( svc->vcpu->processor != cpu
@@ -2024,6 +2096,13 @@ csched2_alloc_pdata(const struct scheduler *ops, int cpu)
         printk("%s: cpu %d not online yet, deferring initializatgion\n",
                __func__, cpu);
 
+    /*
+     * For each new pcpu, allocate a cpumask_t for use throughout the
+     * scheduler to avoid putting any cpumask_t structs on the stack.
+     */
+    if ( !zalloc_cpumask_var(&scratch_mask[cpu]) )
+        return NULL;
+
     return (void *)1;
 }
 
@@ -2072,6 +2151,8 @@ csched2_free_pdata(const struct scheduler *ops, void 
*pcpu, int cpu)
 
     spin_unlock_irqrestore(&prv->lock, flags);
 
+    free_cpumask_var(scratch_mask[cpu]);
+
     return;
 }
 
@@ -2159,6 +2240,10 @@ csched2_init(struct scheduler *ops)
 
     prv->load_window_shift = opt_load_window_shift;
 
+    scratch_mask = xmalloc_array(cpumask_t *, nr_cpu_ids);
+    if ( scratch_mask == NULL )
+        return -ENOMEM;
+
     return 0;
 }
 
@@ -2169,6 +2254,8 @@ csched2_deinit(const struct scheduler *ops)
 
     prv = CSCHED2_PRIV(ops);
     xfree(prv);
+
+    xfree(scratch_mask);
 }
 
 
-- 
1.7.10.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.