
[Xen-devel] [PATCH v3 4/4] sched: credit2: consider per-vcpu soft affinity



Make the credit2 scheduler consider per-vcpu soft affinity when making
decisions for vcpus (run queue assignment, run queue migration, cpu
assignment, and cpu tickling).

Added soft affinity balancing loops to...
 * get_fallback_cpu
 * runq_tickle (one for idle, but not tickled; one for non-idle, and not
   tickled)
 * choose_cpu
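
All of these loops share the same two-step shape (soft affinity first, then
hard affinity), roughly as in the get_fallback_cpu hunk below:

    int balance_step;

    for_each_sched_balance_step( balance_step )
    {
        /* Skip the soft affinity step if the vcpu has no useful soft affinity. */
        if ( balance_step == SCHED_BALANCE_SOFT_AFFINITY
            && !__vcpu_has_soft_affinity(svc->vcpu,
                svc->vcpu->cpu_hard_affinity) )
            continue;

        /* Start from the soft or hard affinity mask for this step... */
        sched_balance_cpumask(svc->vcpu, balance_step, csched2_cpumask);
        /* ... restrict it to the cpus of interest to the caller ... */
        cpumask_and(csched2_cpumask, csched2_cpumask, &svc->rqd->active);
        /* ... and stop at the first step that leaves a usable cpu. */
        if ( !cpumask_empty(csched2_cpumask) )
            break;
    }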

choose_cpu now tries to find the run queue with the most cpus in the given
vcpu's soft affinity. It uses minimum run queue load as a tie breaker.
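
During the soft affinity step this boils down to the comparison below (taken
from the choose_cpu hunk); the hard affinity step keeps the old
lowest-instantaneous-load rule:

    if ( rqd_soft_cpus > 0
        && ( rqd_soft_cpus > max_soft_cpus
            ||
           ( rqd_soft_cpus == max_soft_cpus
            && rqd_avgload < min_avgload )) )
    {
        max_soft_cpus = rqd_soft_cpus;
        rqi = i;
        min_avgload = rqd_avgload;
    }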

Added a function to determine the number of soft cpus gained (or lost) by a
given vcpu if it is migrated from a given source run queue to a given
destination run queue.
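
With both run queue locks held, and leaving aside the cases where one of the
intersections is empty, the helper (get_soft_affinity_gain below) condenses
to:

    /* Number of vc's soft affinity cpus that are active in the destination. */
    sched_balance_cpumask(vc, SCHED_BALANCE_SOFT_AFFINITY, csched2_cpumask);
    cpumask_and(csched2_cpumask, csched2_cpumask, &dst_rqd->active);
    soft_cpus_dst = cpumask_weight(csched2_cpumask);

    /* Number of vc's soft affinity cpus that are active in the source. */
    sched_balance_cpumask(vc, SCHED_BALANCE_SOFT_AFFINITY, csched2_cpumask);
    cpumask_and(csched2_cpumask, csched2_cpumask, &src_rqd->active);
    soft_cpus_src = cpumask_weight(csched2_cpumask);

    return soft_cpus_dst - soft_cpus_src;

So, for example, a vcpu with four active soft cpus in the destination run
queue and one in the source gains 3, the reverse migration loses 3, and a
vcpu with no (effective) soft affinity always scores 0.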

Modified algorithm in balance_load and consider...
 * if the load on lrqd and/or orqd is less than the number of their active
   cpus, balance_load will look for vcpus that would have their soft affinity
   improved by being pushed and/or pulled. Load does not need to be considered
   since a run queue receiving a pushed or pulled vcpu is not being fully
   utilized. This moves vcpus that may have been migrated away from their
   soft affinity by earlier load balancing back onto their soft affinity.
 * in consider, vcpus that might be picked for migration because pushing or
   pulling them decreases the load delta are not picked if their current run
   queue's load is less than its active cpu count and if that migration would
   harm their soft affinity. There's no need to push/pull if the load is under
   capacity, and the vcpu would lose access to some or all of its soft cpus.
 * in consider, if a push/pull/swap migration decreases the load delta by a
   similar amount to another push/pull/swap migration, then use soft cpu gain
   as a tie breaker. This allows load to continue to balance across run queues,
   but favors soft affinity gains if the load deltas are close.
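
"Similar" / "close" here means the absolute difference between the two load
deltas is below one sixteenth of the load window, i.e. (from the new code in
consider):

    delta_diff = delta - st->load_delta;
    if ( delta_diff < 0 )
        delta_diff = -delta_diff;

    /*
     * 1 << load_window_shift is the load window; subtracting
     * CSCHED2_DIVIDE_BY_16 (i.e. 4) from the shift divides it by 16.
     */
    if ( (st->best_push_svc != NULL || st->best_pull_svc != NULL)
        && delta_diff < 1<<(load_window_shift - CSCHED2_DIVIDE_BY_16) )
    {
        /* Deltas are close: let soft affinity gain break the tie. */
    }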

Signed-off-by: Justin T. Weaver <jtweaver@xxxxxxxxxx>
---
Changes in v3:
 * get_fallback_cpu: added balance loop to try to find a soft affinity cpu 
 * runq_tickle: replaced use of local var mask with csched2_cpumask
 * runq_tickle: added two balance loops, one for finding idle, but not
   tickled, and other for finding non-idle with lowest credit
 * choose_cpu: added balance loop to find cpu for given vcpu that has most
   soft cpus (with run queue load being a tie breaker), or if none were found,
   or not considering soft affinity, pick cpu from runq with least load
 * balance_load / consider: removed code that ignored a migration if it meant
   moving a vcpu away from its soft affinity; added migration of vcpus to
   improve their soft affinity if the destination run queue's load is below
   its active cpu count; added check in consider: if the current run queue's
   load is below its active cpu count and the migration would hurt the vcpu's
   soft affinity, do not consider the migration; added soft affinity tie
   breaker in consider if the current load delta and the considered load
   delta are close
 * added helper functions for soft affinity related changes to balance_load
Changes in v2:
 * Not submitted in version 2; focus was on the hard affinity patch
---
 xen/common/sched_credit2.c |  344 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 313 insertions(+), 31 deletions(-)

diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index bbcfbf2..47d0bad 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -127,6 +127,14 @@
 #define CSCHED2_CREDIT_RESET         0
 /* Max timer: Maximum time a guest can be run for. */
 #define CSCHED2_MAX_TIMER            MILLISECS(2)
+/* Used in balance_load to specify migration direction. */
+#define CSCHED2_PULL                 0
+#define CSCHED2_PUSH                 1
+/*
+ * Used in balance_load to decide if deltas are close enough to use soft
+ * affinity as a tie breaker.
+ */
+#define CSCHED2_DIVIDE_BY_16         4
 
 
 #define CSCHED2_IDLE_CREDIT                 (-(1<<30))
@@ -288,15 +296,33 @@ struct csched2_dom {
  */
 static int get_fallback_cpu(struct csched2_vcpu *svc)
 {
+    int balance_step;
+
     if ( likely(cpumask_test_cpu(svc->vcpu->processor,
         svc->vcpu->cpu_hard_affinity)) )
         return svc->vcpu->processor;
 
-    cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
-        &svc->rqd->active);
-    if ( cpumask_empty(csched2_cpumask) )
-        cpumask_and(csched2_cpumask, svc->vcpu->cpu_hard_affinity,
-            VCPU2ONLINE(svc->vcpu));
+    for_each_sched_balance_step( balance_step )
+    {
+        if ( balance_step == SCHED_BALANCE_SOFT_AFFINITY
+            && !__vcpu_has_soft_affinity(svc->vcpu,
+                svc->vcpu->cpu_hard_affinity) )
+            continue;
+
+        sched_balance_cpumask(svc->vcpu, balance_step, csched2_cpumask);
+        cpumask_and(csched2_cpumask, csched2_cpumask, &svc->rqd->active);
+        if ( !cpumask_empty(csched2_cpumask) )
+            break;
+        else
+        {
+            sched_balance_cpumask(svc->vcpu, balance_step, csched2_cpumask);
+            cpumask_and(csched2_cpumask, csched2_cpumask,
+                VCPU2ONLINE(svc->vcpu));
+            if ( !cpumask_empty(csched2_cpumask) )
+                break;
+        }
+    }
+
     ASSERT( !cpumask_empty(csched2_cpumask) );
 
     return cpumask_any(csched2_cpumask);
@@ -516,8 +542,8 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched2_vcpu *
     int i, ipid=-1;
     s_time_t lowest=(1<<30);
     struct csched2_runqueue_data *rqd = RQD(ops, cpu);
-    cpumask_t mask;
     struct csched2_vcpu * cur;
+    int balance_step;
 
     d2printk("rqt %pv curr %pv\n", new->vcpu, current);
 
@@ -534,26 +560,43 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched2_vcpu *
         goto tickle;
     }
     
+    for_each_sched_balance_step ( balance_step )
+    {
+        if ( balance_step == SCHED_BALANCE_SOFT_AFFINITY
+            && !__vcpu_has_soft_affinity(new->vcpu,
+                new->vcpu->cpu_hard_affinity) )
+            continue;
+
         /* Get a mask of idle, but not tickled, that new is allowed to run on. */
-        cpumask_andnot(&mask, &rqd->idle, &rqd->tickled);
-        cpumask_and(&mask, &mask, new->vcpu->cpu_hard_affinity);
-    
+        sched_balance_cpumask(new->vcpu, balance_step, csched2_cpumask);
+        cpumask_and(csched2_cpumask, csched2_cpumask, &rqd->idle);
+        cpumask_andnot(csched2_cpumask, csched2_cpumask, &rqd->tickled);
+
         /* If it's not empty, choose one */
-        i = cpumask_cycle(cpu, &mask);
+        i = cpumask_cycle(cpu, csched2_cpumask);
         if ( i < nr_cpu_ids )
         {
             ipid = i;
             goto tickle;
         }
+    }
 
     /* Otherwise, look for the non-idle cpu with the lowest credit,
      * skipping cpus which have been tickled but not scheduled yet,
      * that new is allowed to run on. */
-        cpumask_andnot(&mask, &rqd->active, &rqd->idle);
-        cpumask_andnot(&mask, &mask, &rqd->tickled);
-        cpumask_and(&mask, &mask, new->vcpu->cpu_hard_affinity);
+    for_each_sched_balance_step ( balance_step )
+    {
+        if ( balance_step == SCHED_BALANCE_SOFT_AFFINITY
+            && !__vcpu_has_soft_affinity(new->vcpu,
+                new->vcpu->cpu_hard_affinity) )
+            continue;
+
+        sched_balance_cpumask(new->vcpu, balance_step, csched2_cpumask);
+        cpumask_and(csched2_cpumask, csched2_cpumask, &rqd->active);
+        cpumask_andnot(csched2_cpumask, csched2_cpumask, &rqd->idle);
+        cpumask_andnot(csched2_cpumask, csched2_cpumask, &rqd->tickled);
 
-        for_each_cpu(i, &mask)
+        for_each_cpu(i, csched2_cpumask)
         {
             struct csched2_vcpu * cur;
 
@@ -586,6 +629,7 @@ runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched2_vcpu *
                           sizeof(d),
                           (unsigned char *)&d);
             }
+        }
     }
 
     /* Only switch to another processor if the credit difference is greater
@@ -1086,7 +1130,7 @@ static int
 choose_cpu(const struct scheduler *ops, struct vcpu *vc)
 {
     struct csched2_private *prv = CSCHED2_PRIV(ops);
-    int i, min_rqi = -1, new_cpu;
+    int i, rqi = -1, new_cpu, max_soft_cpus = 0, balance_step;
     struct csched2_vcpu *svc = CSCHED2_VCPU(vc);
     s_time_t min_avgload;
 
@@ -1143,9 +1187,28 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
 
     min_avgload = MAX_LOAD;
 
-    /* Find the runqueue with the lowest instantaneous load */
+    /*
+     * Find the run queue with the most cpus in vc's soft affinity. If there
+     * is more than one queue with the highest soft affinity cpu count, then
+     * pick the one with the lowest instantaneous run queue load. If the
+     * vcpu does not have soft affinity, then only try to find the run queue
+     * with the lowest instantaneous load.
+     */
+    for_each_sched_balance_step( balance_step )
+    {
+        if ( balance_step == SCHED_BALANCE_SOFT_AFFINITY
+            && !__vcpu_has_soft_affinity(vc, vc->cpu_hard_affinity) )
+            continue;
+
+        if ( balance_step == SCHED_BALANCE_HARD_AFFINITY && rqi > -1 )
+        {
+            balance_step = SCHED_BALANCE_SOFT_AFFINITY;
+            break;
+        }
+
         for_each_cpu(i, &prv->active_queues)
         {
+            int rqd_soft_cpus = 0;
             struct csched2_runqueue_data *rqd;
             s_time_t rqd_avgload = MAX_LOAD;
 
@@ -1163,35 +1226,61 @@ choose_cpu(const struct scheduler *ops, struct vcpu *vc)
              * so it is possible here that svc does not have hard affinity
              * with any of the pcpus of svc's currently assigned run queue.
              */
+            sched_balance_cpumask(vc, balance_step, csched2_cpumask);
             if ( rqd == svc->rqd )
             {
-                if ( cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) )
+                if ( cpumask_intersects(csched2_cpumask, &rqd->active) )
                     rqd_avgload = rqd->b_avgload - svc->avgload;
+                if ( balance_step == SCHED_BALANCE_SOFT_AFFINITY )
+                {
+                    cpumask_and(csched2_cpumask, csched2_cpumask,
+                        &rqd->active);
+                    rqd_soft_cpus = cpumask_weight(csched2_cpumask);
+                }
             }
             else if ( spin_trylock(&rqd->lock) )
             {
-                if ( cpumask_intersects(vc->cpu_hard_affinity, &rqd->active) )
+                if ( cpumask_intersects(csched2_cpumask, &rqd->active) )
                     rqd_avgload = rqd->b_avgload;
+                if ( balance_step == SCHED_BALANCE_SOFT_AFFINITY )
+                {
+                    cpumask_and(csched2_cpumask, csched2_cpumask,
+                        &rqd->active);
+                    rqd_soft_cpus = cpumask_weight(csched2_cpumask);
+                }
 
                 spin_unlock(&rqd->lock);
             }
             else
                 continue;
 
-            if ( rqd_avgload < min_avgload )
+            if ( balance_step == SCHED_BALANCE_SOFT_AFFINITY
+                && rqd_soft_cpus > 0
+                && ( rqd_soft_cpus > max_soft_cpus
+                    ||
+                   ( rqd_soft_cpus == max_soft_cpus
+                    && rqd_avgload < min_avgload )) )
+            {
+                max_soft_cpus = rqd_soft_cpus;
+                rqi = i;
+                min_avgload = rqd_avgload;
+            }
+            else if ( balance_step == SCHED_BALANCE_HARD_AFFINITY
+                     && rqd_avgload < min_avgload )
             {
+                rqi = i;
                 min_avgload = rqd_avgload;
-                min_rqi=i;
             }
+        }
     }
 
     /* We didn't find anyone (most likely because of spinlock contention). */
-    if ( min_rqi == -1 )
+    if ( rqi == -1 )
         new_cpu = get_fallback_cpu(svc);
     else
     {
-        cpumask_and(csched2_cpumask, vc->cpu_hard_affinity,
-            &prv->rqd[min_rqi].active);
+        sched_balance_cpumask(vc, balance_step, csched2_cpumask);
+        cpumask_and(csched2_cpumask, csched2_cpumask, &prv->rqd[rqi].active);
         new_cpu = cpumask_any(csched2_cpumask);
         BUG_ON(new_cpu >= nr_cpu_ids);
     }
@@ -1207,15 +1296,75 @@ typedef struct {
     /* NB: Modified by consider() */
     s_time_t load_delta;
     struct csched2_vcpu * best_push_svc, *best_pull_svc;
+    int soft_affinity_boost;
+    bool_t valid_sa_boost;
     /* NB: Read by consider() */
     struct csched2_runqueue_data *lrqd;
     struct csched2_runqueue_data *orqd;                  
 } balance_state_t;
 
+/*
+ * Return the number of pcpus gained in vc's soft affinity mask that vc can
+ * run on if vc is migrated from run queue src_rqd to run queue dst_rqd.
+ */
+static int get_soft_affinity_gain(const struct vcpu *vc,
+                                  const struct csched2_runqueue_data *src_rqd,
+                                  const struct csched2_runqueue_data *dst_rqd)
+{
+    /*
+     * Locks must already be held for src_rqd and dst_rqd.
+     * Function assumes vc has at least hard affinity with one or more
+     * pcpus in both the source and destination run queues.
+     */
+
+    /* Does vcpu not have soft affinity? */
+    if ( !__vcpu_has_soft_affinity(vc, vc->cpu_hard_affinity) )
+        return 0;
+
+    /* Does vcpu have soft affinity with pcpu(s) in the destination runq? */
+    sched_balance_cpumask(vc, SCHED_BALANCE_SOFT_AFFINITY, csched2_cpumask);
+    if ( cpumask_intersects(csched2_cpumask, &dst_rqd->active) )
+    {
+        int soft_cpus_dst;
+        cpumask_and(csched2_cpumask, csched2_cpumask, &dst_rqd->active);
+        soft_cpus_dst = cpumask_weight(csched2_cpumask);
+
+        /* ... and soft affinity with the source run queue? */
+        sched_balance_cpumask(vc, SCHED_BALANCE_SOFT_AFFINITY,
+            csched2_cpumask);
+        if ( cpumask_intersects(csched2_cpumask, &src_rqd->active) )
+        {
+            int soft_cpus_src;
+            cpumask_and(csched2_cpumask, csched2_cpumask, &src_rqd->active);
+            soft_cpus_src = cpumask_weight(csched2_cpumask);
+
+            /* Soft affinity to soft affinity migration. */
+            return soft_cpus_dst - soft_cpus_src;
+        }
+        else
+            /* Hard affinity to soft affinity migration. */
+            return soft_cpus_dst;
+    }
+    else
+    {
+        int soft_cpus_src = 0;
+        cpumask_and(csched2_cpumask, csched2_cpumask, &src_rqd->active);
+        soft_cpus_src = cpumask_weight(csched2_cpumask);
+
+        /*
+         * Hard affinity to hard affinity migration or soft affinity to hard
+         * affinity migration.
+         */
+        return -soft_cpus_src;
+    }
+}
+
 static void consider(balance_state_t *st, 
                      struct csched2_vcpu *push_svc,
-                     struct csched2_vcpu *pull_svc)
+                     struct csched2_vcpu *pull_svc,
+                     int load_window_shift)
 {
+    int delta_diff;
     s_time_t l_load, o_load, delta;
 
     l_load = st->lrqd->b_avgload;
@@ -1237,12 +1386,88 @@ static void consider(balance_state_t *st,
     if ( delta < 0 )
         delta = -delta;
 
-    if ( delta < st->load_delta )
+    /*
+     * Use soft affinity gain as a tie breaker if at least one migration has
+     * already been picked and stored in the balance state, and the absolute
+     * value of the difference between the delta in st and the new delta being
+     * considered here is less than 1/16th of the load window,
+     * i.e. (1 << load_window_shift) / 16.
+     */
+    delta_diff = delta - st->load_delta;
+    if ( delta_diff < 0 )
+        delta_diff = -delta_diff;
+    if ( (st->best_push_svc != NULL || st->best_pull_svc != NULL)
+        && delta_diff < 1<<(load_window_shift - CSCHED2_DIVIDE_BY_16) )
     {
-        st->load_delta = delta;
-        st->best_push_svc=push_svc;
-        st->best_pull_svc=pull_svc;
+        int st_soft_gain = 0, consider_soft_gain = 0;
+
+        /* Find the soft affinity gain for the migration in st. */
+        if ( !st->valid_sa_boost )
+        {
+            if ( st->best_push_svc )
+                st_soft_gain += get_soft_affinity_gain(
+                    st->best_push_svc->vcpu, st->lrqd, st->orqd);
+            if ( st->best_pull_svc )
+                st_soft_gain += get_soft_affinity_gain(
+                    st->best_pull_svc->vcpu, st->orqd, st->lrqd);
+        }
+        else
+            st_soft_gain = st->soft_affinity_boost;
+
+        /* Find the soft affinity gain for the migration being considered. */
+        if ( push_svc )
+        {
+            int push_soft_gain = get_soft_affinity_gain(
+                push_svc->vcpu, st->lrqd, st->orqd);
+            if ( push_soft_gain < 0
+                && st->lrqd->load < cpumask_weight(&st->lrqd->active) )
+                return;
+            consider_soft_gain = push_soft_gain;
+        }
+        if ( pull_svc )
+        {
+            int pull_soft_gain = get_soft_affinity_gain(
+                pull_svc->vcpu, st->orqd, st->lrqd);
+            if ( pull_soft_gain < 0
+                && st->orqd->load < cpumask_weight(&st->orqd->active) )
+                return;
+            consider_soft_gain += pull_soft_gain;
+        }
+
+        /* Store the higher gain in the balance state. */
+        st->soft_affinity_boost = consider_soft_gain > st_soft_gain ?
+            consider_soft_gain : st_soft_gain;
+        st->valid_sa_boost = 1;
+
+        if ( consider_soft_gain > st_soft_gain )
+            goto choose_migration;
+        else if ( st_soft_gain > consider_soft_gain )
+            return;
+
+        /* Soft affinity gain is the same; fall through. */
     }
+
+    /* Only consider load delta. */
+    if ( delta < st->load_delta )
+    {
+        st->valid_sa_boost = 0;
+
+        /*
+         * If the migration results in a loss of some or all soft cpus and the
+         * vcpu's current run queue has less load than its count of active
+         * pcpus, do not use the migration.
+         */
+        if ( push_svc &&
+            (st->lrqd->load < cpumask_weight(&st->lrqd->active) &&
+             get_soft_affinity_gain(push_svc->vcpu, st->lrqd, st->orqd) < 0) )
+            return;
+        if ( pull_svc &&
+            (st->orqd->load < cpumask_weight(&st->orqd->active) &&
+             get_soft_affinity_gain(pull_svc->vcpu, st->orqd, st->lrqd) < 0) )
+            return;
+    }
+    else
+        return;
+
+choose_migration:
+    st->load_delta = delta;
+    st->best_push_svc=push_svc;
+    st->best_pull_svc=pull_svc;
 }
 
 
@@ -1291,6 +1516,31 @@ static void migrate(const struct scheduler *ops,
     }
 }
 
+/* Returns true if migration should be picked regardless of run queue load. */
+static bool_t consider_soft_affinity(balance_state_t *st,
+                                     struct csched2_vcpu *svc,
+                                     int direction)
+{
+    if ( direction == CSCHED2_PUSH )
+    {
+        if ( get_soft_affinity_gain(svc->vcpu, st->lrqd, st->orqd) > 0 )
+        {
+            st->best_push_svc = svc;
+            return 1;
+        }
+    }
+    else if ( direction == CSCHED2_PULL )
+    {
+        if ( get_soft_affinity_gain(svc->vcpu, st->orqd, st->lrqd) > 0 )
+        {
+            st->best_pull_svc = svc;
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
 /*
  * Migration of vcpu svc to run queue rqd is a valid option if svc is not
  * already flagged to migrate and if svc is allowed to run on at least one of
@@ -1333,6 +1583,7 @@ retry:
         return;
 
     st.load_delta = 0;
+    st.soft_affinity_boost = 0;
 
     for_each_cpu(i, &prv->active_queues)
     {
@@ -1356,6 +1607,36 @@ retry:
             max_delta_rqi = i;
         }
 
+        /*
+         * If run queue load on lrqd and/or orqd is less than their active
+         * cpu counts, then look for any vcpus that can improve their
+         * soft affinity with a push and/or pull migration. Load does not
+         * need to be considered here.
+         */
+        if ( st.orqd->load < cpumask_weight(&st.orqd->active) )
+            list_for_each( push_iter, &st.lrqd->svc )
+            {
+                struct csched2_vcpu * push_svc = list_entry(
+                    push_iter, struct csched2_vcpu, rqd_elem);
+
+                if ( consider_soft_affinity(&st, push_svc, CSCHED2_PUSH) )
+                    break;
+            }
+        if ( st.lrqd->load < cpumask_weight(&st.lrqd->active) )
+            list_for_each( pull_iter, &st.orqd->svc )
+            {
+                struct csched2_vcpu * pull_svc = list_entry(
+                    pull_iter, struct csched2_vcpu, rqd_elem);
+
+                if ( consider_soft_affinity(&st, pull_svc, CSCHED2_PULL) )
+                    break;
+            }
+        if ( st.best_pull_svc != NULL || st.best_push_svc != NULL )
+        {
+            spin_unlock(&prv->lock);
+            goto migrate;
+        }
+
         spin_unlock(&st.orqd->lock);
     }
 
@@ -1428,13 +1709,13 @@ retry:
             if ( !valid_vcpu_migration(pull_svc, st.lrqd) )
                 continue;
 
-            consider(&st, push_svc, pull_svc);
+            consider(&st, push_svc, pull_svc, prv->load_window_shift);
         }
 
         inner_load_updated = 1;
 
         /* Consider push only */
-        consider(&st, push_svc, NULL);
+        consider(&st, push_svc, NULL, prv->load_window_shift);
     }
 
     list_for_each( pull_iter, &st.orqd->svc )
@@ -1445,9 +1726,10 @@ retry:
             continue;
 
         /* Consider pull only */
-        consider(&st, NULL, pull_svc);
+        consider(&st, NULL, pull_svc, prv->load_window_shift);
     }
 
+migrate:
     /* OK, now we have some candidates; do the moving */
     if ( st.best_push_svc )
         migrate(ops, st.best_push_svc, st.orqd, now);
-- 
1.7.10.4

