
Re: [Xen-devel] [PATCH] credit2-ratelimit: Implement rate limit for credit2 scheduler



On Wed, Jul 6, 2016 at 6:33 PM, Anshul Makkar <anshul.makkar@xxxxxxxxxx> wrote:
> From: Anshul Makkar <anshul.makkar@xxxxxxxxxx>
>
> Rate limiting assures that a vcpu will execute for a minimum amount of time
> before being put at the back of the runqueue or being preempted by a
> higher-priority thread.
>
> It introduces a minimum amount of latency to enable a VM to batch its work,
> and it also ensures that the system is not spending most of its time in
> VMEXIT/VMENTRY because of a VM that is waking/sleeping at a high rate.
>
> The ratelimit can be disabled by setting it to 0.
>
> Signed-off-by: Anshul Makkar <anshul.makkar@xxxxxxxxxx>

Hey Anshul!  Thanks for submitting this.  A few comments.

> ---
> ---
>  xen/common/sched_credit2.c | 115 ++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 98 insertions(+), 17 deletions(-)
>
> diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
> index 1933ff1..6718574 100644
> --- a/xen/common/sched_credit2.c
> +++ b/xen/common/sched_credit2.c
> @@ -171,6 +171,11 @@ integer_param("sched_credit2_migrate_resist", opt_migrate_resist);
>  #define c2r(_ops, _cpu)     (CSCHED2_PRIV(_ops)->runq_map[(_cpu)])
>  /* CPU to runqueue struct macro */
>  #define RQD(_ops, _cpu)     (&CSCHED2_PRIV(_ops)->rqd[c2r(_ops, _cpu)])
> +/* Find the max of time slice */
> +#define MAX_TSLICE(t1, t2)  \
> +                   ({ typeof (t1) _t1 = (t1); \
> +                      typeof (t1) _t2 = (t2); \
> +                      _t1 > _t2 ? _t1 < 0 ? 0 : _t1 : _t2 < 0 ? 0 : _t2; })

Hiding the zero-checking behind this seems a bit strange to me.  I
think if the algorithm is properly laid out, we probably don't need
this at all (see my proposed alternate below).
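
As an aside, the macro as posted initializes _t2 with "typeof (t1)"
rather than "typeof (t2)".  And if a max-of-timeslices helper were
kept at all, I think it would read more clearly with the negative
clamp split out; a minimal sketch (not part of the patch):

    #define MAX_TSLICE(t1, t2)              \
        ({ typeof (t1) _t1 = (t1);          \
           typeof (t2) _t2 = (t2);          \
           _t1 > _t2 ? _t1 : _t2; })

    /* ...and clamp negative values explicitly, where they can occur: */
    if ( runtime < 0 )
        runtime = 0;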

>
>  /*
>   * Shifts for load average.
> @@ -280,6 +285,7 @@ struct csched2_private {
>      struct csched2_runqueue_data rqd[NR_CPUS];
>
>      unsigned int load_window_shift;
>      unsigned ratelimit_us; /* each cpupool can have its own ratelimit */
>  };
>
>  /*
> @@ -1588,6 +1594,34 @@ csched2_dom_cntl(
>      return rc;
>  }
>
> +static int csched2_sys_cntl(const struct scheduler *ops,
> +                            struct xen_sysctl_scheduler_op *sc)
> +{
> +    int rc = -EINVAL;
> +    xen_sysctl_credit_schedule_t *params = &sc->u.sched_credit;
> +    struct csched2_private *prv = CSCHED2_PRIV(ops);
> +    unsigned long flags;
> +
> +    switch (sc->cmd )
> +    {
> +        case XEN_SYSCTL_SCHEDOP_putinfo:
> +            if ( params->ratelimit_us &&
> +                ( params->ratelimit_us < CSCHED2_MIN_TIMER ||
> +                  params->ratelimit_us > MICROSECS(CSCHED2_MAX_TIMER) ))
> +                return rc;
> +            spin_lock_irqsave(&prv->lock, flags);
> +            prv->ratelimit_us = params->ratelimit_us;
> +            spin_unlock_irqrestore(&prv->lock, flags);
> +            break;
> +
> +        case XEN_SYSCTL_SCHEDOP_getinfo:
> +            params->ratelimit_us = prv->ratelimit_us;
> +            rc = 0;
> +            break;
> +    }
> +    return rc;
> +}
> +
>  static void *
>  csched2_alloc_domdata(const struct scheduler *ops, struct domain *dom)
>  {
> @@ -1657,12 +1691,15 @@ csched2_dom_destroy(const struct scheduler *ops, 
> struct domain *dom)
>
>  /* How long should we let this vcpu run for? */
>  static s_time_t
> -csched2_runtime(const struct scheduler *ops, int cpu, struct csched2_vcpu *snext)
> +csched2_runtime(const struct scheduler *ops, int cpu,
> +                struct csched2_vcpu *snext, s_time_t now)
>  {
> -    s_time_t time;
> +    s_time_t time;
>      int rt_credit; /* Proposed runtime measured in credits */
>      struct csched2_runqueue_data *rqd = RQD(ops, cpu);
>      struct list_head *runq = &rqd->runq;
> +    s_time_t runtime = 0;
> +    struct csched2_private *prv = CSCHED2_PRIV(ops);
>
>      /*
>       * If we're idle, just stay so. Others (or external events)
> @@ -1680,6 +1717,14 @@ csched2_runtime(const struct scheduler *ops, int cpu, struct csched2_vcpu *snext
>
>      /* 1) Basic time: Run until credit is 0. */
>      rt_credit = snext->credit;
> +    if (snext->vcpu->is_running)
> +        runtime = now - snext->vcpu->runstate.state_entry_time;
> +    if ( runtime < 0 )
> +    {
> +        runtime = 0;
> +        d2printk("%s: Time went backwards? now %"PRI_stime" state_entry_time %"PRI_stime"\n",
> +                  __func__, now, snext->vcpu->runstate.state_entry_time);
> +    }
>
>      /* 2) If there's someone waiting whose credit is positive,
>       * run until your credit ~= his */
> @@ -1695,11 +1740,24 @@ csched2_runtime(const struct scheduler *ops, int cpu, struct csched2_vcpu *snext
>      }
>
>      /* The next guy may actually have a higher credit, if we've tried to
> -     * avoid migrating him from a different cpu.  DTRT.  */
> +     * avoid migrating him from a different cpu.  DTRT.
> +     * Even if the next guy has higher credit, if the current vcpu has
> +     * executed for less time than the rate limit, allow it to run for
> +     * the minimum amount of time.
> +     */
>      if ( rt_credit <= 0 )
>      {
> -        time = CSCHED2_MIN_TIMER;
> -        SCHED_STAT_CRANK(runtime_min_timer);
> +        if ( snext->vcpu->is_running && prv->ratelimit_us )
> +           /* implies the current one has executed for time < ratelimit and
> +            * thus it has been selected in runq_candidate to run next.
> +            * No need to check for this condition again.
> +            */
> +            time = MAX_TSLICE(CSCHED2_MIN_TIMER,
> +                               MICROSECS(prv->ratelimit_us) - runtime);
> +        else
> +            time = MAX_TSLICE(CSCHED2_MIN_TIMER, MICROSECS(prv->ratelimit_us));
> +
> +        SCHED_STAT_CRANK(time);
>      }
>      else
>      {
> @@ -1709,17 +1767,22 @@ csched2_runtime(const struct scheduler *ops, int cpu, struct csched2_vcpu *snext
>           * at a different rate. */
>          time = c2t(rqd, rt_credit, snext);
>
> -        /* Check limits */
> -        if ( time < CSCHED2_MIN_TIMER )
> -        {
> -            time = CSCHED2_MIN_TIMER;
> -            SCHED_STAT_CRANK(runtime_min_timer);
> -        }
> -        else if ( time > CSCHED2_MAX_TIMER )
> +        /* Check limits.
> +         * time > ratelimit --> time > MIN.
> +         */
> +        if ( time > CSCHED2_MAX_TIMER )
>          {
> +
>              time = CSCHED2_MAX_TIMER;
>              SCHED_STAT_CRANK(runtime_max_timer);
>          }
> +        else
> +        {
> +            time = MAX_TSLICE(MAX_TSLICE(CSCHED2_MIN_TIMER,
> +                                          MICROSECS(prv->ratelimit_us)), time);
> +            SCHED_STAT_CRANK(time);
> +        }
> +
>      }

This is all really confusing.  What about something like this (also
attached)?  The basic idea is to calculate min_time, then go through
the normal algorithm, then clip it once at the end.

@@ -1657,12 +1691,14 @@ csched2_dom_destroy(const struct scheduler *ops, struct domain *dom)

 /* How long should we let this vcpu run for? */
 static s_time_t
-csched2_runtime(const struct scheduler *ops, int cpu, struct csched2_vcpu *snext)
+csched2_runtime(const struct scheduler *ops, int cpu,
+                struct csched2_vcpu *snext, s_time_t now)
 {
-    s_time_t time;
+    s_time_t time, min_time;
     int rt_credit; /* Proposed runtime measured in credits */
     struct csched2_runqueue_data *rqd = RQD(ops, cpu);
     struct list_head *runq = &rqd->runq;
+    struct csched2_private *prv = CSCHED2_PRIV(ops);

     /*
      * If we're idle, just stay so. Others (or external events)
@@ -1675,9 +1711,19 @@ csched2_runtime(const struct scheduler *ops, int cpu, struct csched2_vcpu *snext
      * 1) Run until snext's credit will be 0
      * 2) But if someone is waiting, run until snext's credit is equal
      * to his
-     * 3) But never run longer than MAX_TIMER or shorter than MIN_TIMER.
+     * 3) But never run longer than MAX_TIMER or shorter than MIN_TIMER
+     * or the ratelimit time.
      */

+    /* Calculate mintime */
+    min_time = CSCHED2_MIN_TIMER;
+    if ( prv->ratelimit_us ) {
+        s_time_t ratelimit_min = snext->vcpu->runstate.state_entry_time +
+            MICROSECS(prv->ratelimit_us) - now;
+        if ( ratelimit_min > min_time )
+            min_time = ratelimit_min;
+    }
+
     /* 1) Basic time: Run until credit is 0. */
     rt_credit = snext->credit;

@@ -1694,32 +1740,33 @@ csched2_runtime(const struct scheduler *ops, int cpu, struct csched2_vcpu *snext
         }
     }

-    /* The next guy may actually have a higher credit, if we've tried to
-     * avoid migrating him from a different cpu.  DTRT.  */
-    if ( rt_credit <= 0 )
+    /*
+     * The next guy on the runqueue may actually have a higher credit,
+     * if we've tried to avoid migrating him from a different cpu.
+     * Setting time=0 will ensure the minimum timeslice is chosen.
+     *
+     * FIXME: See if we can eliminate this conversion if we know time
+     * will be outside (MIN,MAX).  Probably requires pre-calculating
+     * credit values of MIN,MAX per vcpu, since each vcpu burns credit
+     * at a different rate.
+     */
+    if ( rt_credit > 0 )
+        time = c2t(rqd, rt_credit, snext);
+    else
+        time = 0;
+
+
+    /* 3) But never run longer than MAX_TIMER or shorter than MIN_TIMER
+     * or the ratelimit time */
+    if ( time < min_time )
     {
-        time = CSCHED2_MIN_TIMER;
+        time = min_time;
         SCHED_STAT_CRANK(runtime_min_timer);
     }
-    else
+    else if ( time > CSCHED2_MAX_TIMER )
     {
-        /* FIXME: See if we can eliminate this conversion if we know time
-         * will be outside (MIN,MAX).  Probably requires pre-calculating
-         * credit values of MIN,MAX per vcpu, since each vcpu burns credit
-         * at a different rate. */
-        time = c2t(rqd, rt_credit, snext);
-
-        /* Check limits */
-        if ( time < CSCHED2_MIN_TIMER )
-        {
-            time = CSCHED2_MIN_TIMER;
-            SCHED_STAT_CRANK(runtime_min_timer);
-        }
-        else if ( time > CSCHED2_MAX_TIMER )
-        {
-            time = CSCHED2_MAX_TIMER;
-            SCHED_STAT_CRANK(runtime_max_timer);
-        }
+        time = CSCHED2_MAX_TIMER;
+        SCHED_STAT_CRANK(runtime_max_timer);
     }

     return time;


>
>      return time;
> @@ -1733,7 +1796,7 @@ void __dump_execstate(void *unused);
>  static struct csched2_vcpu *
>  runq_candidate(struct csched2_runqueue_data *rqd,
>                 struct csched2_vcpu *scurr,
> -               int cpu, s_time_t now)
> +               int cpu, s_time_t now, struct csched2_private *prv)

Since we have the cpu, I think we can get ops this way, without
cluttering things up with the extra argument:

    struct csched2_private *prv = CSCHED2_PRIV(per_cpu(scheduler, cpu));
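
That way runq_candidate() can keep its original signature.  Roughly
(a sketch only, assuming the common code's per-cpu "scheduler"
variable):

    static struct csched2_vcpu *
    runq_candidate(struct csched2_runqueue_data *rqd,
                   struct csched2_vcpu *scurr,
                   int cpu, s_time_t now)
    {
        /* Recover the private data from the per-cpu scheduler pointer
         * instead of threading it through as an extra argument. */
        struct csched2_private *prv = CSCHED2_PRIV(per_cpu(scheduler, cpu));
        ...
    }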

>  {
>      struct list_head *iter;
>      struct csched2_vcpu *snext = NULL;
> @@ -1744,6 +1807,16 @@ runq_candidate(struct csched2_runqueue_data *rqd,
>      else
>          snext = CSCHED2_VCPU(idle_vcpu[cpu]);
>
> +    /* Return the current vcpu if it has executed for less than the ratelimit.
> +     * Adjustment of the selected vcpu's credit and the decision on
> +     * how long it will run will be taken in csched2_runtime.
> +     */
> +    if ( prv->ratelimit_us && !is_idle_vcpu(scurr->vcpu) &&
> +         vcpu_runnable(scurr->vcpu) &&
> +         (now - scurr->vcpu->runstate.state_entry_time) <
> +          MICROSECS(prv->ratelimit_us) )
> +        return scurr;
> +

This looks OK.  The comment should probably follow the general hypervisor style:
/*
 * blah blah blah
 */

I realize most of the file isn't like this, but that was probably a mistake. :-)

>      list_for_each( iter, &rqd->runq )
>      {
>          struct csched2_vcpu * svc = list_entry(iter, struct csched2_vcpu, runq_elem);
> @@ -1762,9 +1835,13 @@ runq_candidate(struct csched2_runqueue_data *rqd,
>          }
>
>          /* If the next one on the list has more credit than current
> -         * (or idle, if current is not runnable), choose it. */
> +         * (or idle, if current is not runnable) and the current one has
> +         * already executed for more than the ratelimit, choose it.
> +         * Reaching this point means that the current vcpu has executed >
> +         * ratelimit_us or the ratelimit is off, so choose the next one.
> +         */
>          if ( svc->credit > snext->credit )
> -            snext = svc;
> +                snext = svc;

I agree with Dario; this comment change isn't very helpful.

>
>          /* In any case, if we got this far, break. */
>          break;
> @@ -1787,6 +1864,7 @@ csched2_schedule(
>      struct csched2_vcpu * const scurr = CSCHED2_VCPU(current);
>      struct csched2_vcpu *snext = NULL;
>      struct task_slice ret;
> +    struct csched2_private *prv = CSCHED2_PRIV(ops);
>
>      SCHED_STAT_CRANK(schedule);
>      CSCHED2_VCPU_CHECK(current);
> @@ -1857,7 +1935,7 @@ csched2_schedule(
>          snext = CSCHED2_VCPU(idle_vcpu[cpu]);
>      }
>      else
> -        snext=runq_candidate(rqd, scurr, cpu, now);
> +        snext=runq_candidate(rqd, scurr, cpu, now, prv);
>
>      /* If switching from a non-idle runnable vcpu, put it
>       * back on the runqueue. */
> @@ -1921,7 +1999,7 @@ csched2_schedule(
>      /*
>       * Return task to run next...
>       */
> -    ret.time = csched2_runtime(ops, cpu, snext);
> +    ret.time = csched2_runtime(ops, cpu, snext, now);
>      ret.task = snext->vcpu;
>
>      CSCHED2_VCPU_CHECK(ret.task);
> @@ -2353,6 +2431,8 @@ csched2_init(struct scheduler *ops)
>          prv->runq_map[i] = -1;
>          prv->rqd[i].id = -1;
>      }
> +    /* initialize ratelimit */
> +    prv->ratelimit_us = 1000 * CSCHED2_MIN_TIMER;

Is there any reason this isn't using sched_ratelimit_us (the
hypervisor command-line option, which the credit scheduler already uses)?
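
Something along these lines in csched2_init() would match what
sched_credit.c does; a sketch, assuming the global sched_ratelimit_us
from the common scheduler code is visible here:

    /* Initialise from the "sched_ratelimit_us" boot parameter, as
     * credit1 does, rather than hard-coding a credit2-only default. */
    prv->ratelimit_us = sched_ratelimit_us;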

Thanks again!

 -George

Attachment: example.diff
Description: Text document
