Re: [Xen-devel] [PATCH RFC v2 1/4] xen: add real time scheduler rt



On 29/07/14 02:52, Meng Xu wrote:
> This scheduler follows the pre-emptive Global EDF theory in real-time field.
> Each VCPU can have a dedicated period and budget.
> While scheduled, a VCPU burns its budget.
> A VCPU has its budget replenished at the beginning of each of its periods;
> The VCPU discards its unused budget at the end of each of its periods.
> If a VCPU runs out of budget in a period, it has to wait until next period.
> The mechanism of how to burn a VCPU's budget depends on the server mechanism
> implemented for each VCPU.
>
> Server mechanism: a VCPU is implemented as a deferable server.
> When a VCPU is scheduled to execute on a PCPU, its budget is continuously
> burned.
>
> Priority scheme: Preemptive Global Earliest Deadline First (gEDF).
> At any scheduling point, the VCPU with earliest deadline has highest
> priority.
>
> Queue scheme: A global Runqueue for each CPU pool.
> The Runqueue holds all runnable VCPUs.
> VCPUs in the Runqueue are divided into two parts: with and without budget.
> At each part, VCPUs are sorted based on gEDF priority scheme.
>
> Scheduling quantum: 1 ms;
>
> Note: cpumask and cpupool is supported.
>
> This is still in the development phase.
>
> Signed-off-by: Sisu Xi <xisisu@xxxxxxxxx>
> Signed-off-by: Meng Xu <mengxu@xxxxxxxxxxxxx>
> ---
>  xen/common/Makefile         |    1 +
>  xen/common/sched_rt.c       | 1058 
> +++++++++++++++++++++++++++++++++++++++++++
>  xen/common/schedule.c       |    4 +-
>  xen/include/public/domctl.h |   28 +-
>  xen/include/xen/sched-if.h  |    1 +
>  5 files changed, 1089 insertions(+), 3 deletions(-)
>  create mode 100644 xen/common/sched_rt.c
>
> diff --git a/xen/common/Makefile b/xen/common/Makefile
> index 3683ae3..5a23aa4 100644
> --- a/xen/common/Makefile
> +++ b/xen/common/Makefile
> @@ -26,6 +26,7 @@ obj-y += sched_credit.o
>  obj-y += sched_credit2.o
>  obj-y += sched_sedf.o
>  obj-y += sched_arinc653.o
> +obj-y += sched_rt.o
>  obj-y += schedule.o
>  obj-y += shutdown.o
>  obj-y += softirq.o
> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> new file mode 100644
> index 0000000..6cfbb8a
> --- /dev/null
> +++ b/xen/common/sched_rt.c
> @@ -0,0 +1,1058 @@
> +/******************************************************************************
> + * Preemptive Global Earliest Deadline First  (EDF) scheduler for Xen
> + * EDF scheduling is one of most popular real-time scheduling algorithm used 
> in
> + * embedded field.
> + *
> + * by Sisu Xi, 2013, Washington University in Saint Louis
> + * and Meng Xu, 2014, University of Pennsylvania
> + *
> + * based on the code of credit Scheduler
> + */
> +
> +#include <xen/config.h>
> +#include <xen/init.h>
> +#include <xen/lib.h>
> +#include <xen/sched.h>
> +#include <xen/domain.h>
> +#include <xen/delay.h>
> +#include <xen/event.h>
> +#include <xen/time.h>
> +#include <xen/perfc.h>
> +#include <xen/sched-if.h>
> +#include <xen/softirq.h>
> +#include <asm/atomic.h>
> +#include <xen/errno.h>
> +#include <xen/trace.h>
> +#include <xen/cpu.h>
> +#include <xen/keyhandler.h>
> +#include <xen/trace.h>
> +#include <xen/guest_access.h>
> +
> +/*
> + * TODO:
> + *
> + * Migration compensation and resist like credit2 to better use cache;
> + * Lock Holder Problem, using yield?
> + * Self switch problem: VCPUs of the same domain may preempt each other;
> + */
> +
> +/*
> + * Design:
> + *
> + * This scheduler follows the Preemptive Global EDF theory in real-time 
> field.
> + * Each VCPU can have a dedicated period and budget. 
> + * While scheduled, a VCPU burns its budget.
> + * A VCPU has its budget replenished at the beginning of each of its periods;
> + * The VCPU discards its unused budget at the end of each of its periods.
> + * If a VCPU runs out of budget in a period, it has to wait until next 
> period.
> + * The mechanism of how to burn a VCPU's budget depends on the server 
> mechanism
> + * implemented for each VCPU.
> + *
> + * Server mechanism: a VCPU is implemented as a deferable server.
> + * When a VCPU has a task running on it, its budget is continuously burned;
> + * When a VCPU has no task but with budget left, its budget is preserved.
> + *
> + * Priority scheme: Preemptive Global Earliest Deadline First (gEDF).
> + * At any scheduling point, the VCPU with earliest deadline has highest 
> priority.
> + *
> + * Queue scheme: A global runqueue for each CPU pool. 
> + * The runqueue holds all runnable VCPUs. 
> + * VCPUs in the runqueue are divided into two parts: with and without 
> remaining budget. 
> + * At each part, VCPUs are sorted based on EDF priority scheme.
> + *
> + * Scheduling quanta: 1 ms; but accounting the budget is in microsecond.
> + *
> + * Note: cpumask and cpupool is supported.
> + */
> +
> +/*
> + * Locking:
> + * Just like credit2, a global system lock is used to protect the RunQ.
> + * The global lock is referenced by schedule_data.schedule_lock from all 
> physical cpus.
> + *
> + * The lock is already grabbed when calling wake/sleep/schedule/ functions 
> in schedule.c
> + *
> + * The functions involes RunQ and needs to grab locks are:
> + *    dump, vcpu_insert, vcpu_remove, context_saved,
> + */
> +
> +
> +/*
> + * Default parameters: Period and budget in default is 10 and 4 ms, 
> respectively
> + */
> +#define RT_DEFAULT_PERIOD     (MICROSECS(10))
> +#define RT_DEFAULT_BUDGET     (MICROSECS(4))
> +
> +/*
> + * Useful macros
> + */
> +#define RT_PRIV(_ops)     ((struct rt_private *)((_ops)->sched_data))
> +#define RT_VCPU(_vcpu)    ((struct rt_vcpu *)(_vcpu)->sched_priv)
> +#define RT_DOM(_dom)      ((struct rt_dom *)(_dom)->sched_priv)
> +#define RUNQ(_ops)              (&RT_PRIV(_ops)->runq)

I know you are copying the prevailing style, but these macros are nasty.

They would be perfectly fine as static inlines with some real types...

static inline struct rt_private *RT_PRIV(const struct scheduler *ops)
{
    return ops->sched_data;
}

... which allow for rather more useful compiler errors in the case that
they get accidentally misused.
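
In the same spirit, the other accessors could become typed helpers as well.
A sketch following the same pattern (names and fields as in the patch):

static inline struct rt_vcpu *RT_VCPU(const struct vcpu *vcpu)
{
    return vcpu->sched_priv;
}

static inline struct rt_dom *RT_DOM(const struct domain *dom)
{
    return dom->sched_priv;
}

static inline struct list_head *RUNQ(const struct scheduler *ops)
{
    return &RT_PRIV(ops)->runq;
}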

> +
> +/*
> + * Flags
> + */
> +/* RT_scheduled: Is this vcpu either running on, or context-switching off,
> + * a phyiscal cpu?
> + * + Accessed only with Runqueue lock held.
> + * + Set when chosen as next in rt_schedule().
> + * + Cleared after context switch has been saved in rt_context_saved()
> + * + Checked in vcpu_wake to see if we can add to the Runqueue, or if we 
> should
> + *   set RT_delayed_runq_add
> + * + Checked to be false in runq_insert.
> + */
> +#define __RT_scheduled            1
> +#define RT_scheduled (1<<__RT_scheduled)
> +/* RT_delayed_runq_add: Do we need to add this to the Runqueueu once it'd 
> done 
> + * being context switching out?
> + * + Set when scheduling out in rt_schedule() if prev is runable
> + * + Set in rt_vcpu_wake if it finds RT_scheduled set
> + * + Read in rt_context_saved(). If set, it adds prev to the Runqueue and
> + *   clears the bit.
> + *
> + */
> +#define __RT_delayed_runq_add     2
> +#define RT_delayed_runq_add (1<<__RT_delayed_runq_add)
> +
> +/*
> + * Debug only. Used to printout debug information
> + */
> +#define printtime()\
> +        ({s_time_t now = NOW(); \
> +          printk("%u : %3ld.%3ldus : %-19s ",\
> +          smp_processor_id(), now/MICROSECS(1), now%MICROSECS(1)/1000, 
> __func__);} )
> +
> +/*
> + * Systme-wide private data, include a global RunQueue
> + * The global lock is referenced by schedule_data.schedule_lock from all 
> physical cpus.
> + * It can be grabbed via vcpu_schedule_lock_irq()
> + */
> +struct rt_private {
> +    spinlock_t lock;        /* The global coarse grand lock */
> +    struct list_head sdom;  /* list of availalbe domains, used for dump */
> +    struct list_head runq;  /* Ordered list of runnable VMs */
> +    cpumask_t cpus;         /* cpumask_t of available physical cpus */
> +    cpumask_t tickled;      /* another cpu in the queue already ticked this 
> one */
> +};
> +
> +/*
> + * Virtual CPU
> + */
> +struct rt_vcpu {
> +    struct list_head runq_elem; /* On the runqueue list */
> +    struct list_head sdom_elem; /* On the domain VCPU list */
> +
> +    /* Up-pointers */
> +    struct rt_dom *sdom;
> +    struct vcpu *vcpu;
> +
> +    /* VCPU parameters, in milliseconds */
> +    s_time_t period;
> +    s_time_t budget;
> +
> +    /* VCPU current infomation in nanosecond */
> +    long cur_budget;             /* current budget */
> +    s_time_t last_start;        /* last start time */
> +    s_time_t cur_deadline;      /* current deadline for EDF */
> +
> +    unsigned flags;             /* mark __RT_scheduled, etc.. */
> +};
> +
> +/*
> + * Domain
> + */
> +struct rt_dom {
> +    struct list_head vcpu;      /* link its VCPUs */
> +    struct list_head sdom_elem; /* link list on rt_priv */
> +    struct domain *dom;         /* pointer to upper domain */
> +};
> +
> +/*
> + * RunQueue helper functions
> + */
> +static int
> +__vcpu_on_runq(struct rt_vcpu *svc)
> +{
> +   return !list_empty(&svc->runq_elem);
> +}
> +
> +static struct rt_vcpu *
> +__runq_elem(struct list_head *elem)
> +{
> +    return list_entry(elem, struct rt_vcpu, runq_elem);
> +}
> +
> +/*
> + * Debug related code, dump vcpu/cpu information
> + */
> +static void
> +rt_dump_vcpu(struct rt_vcpu *svc)

const struct rt_vcpu *svc, for added safety.  (In the past we have had a
dump function which accidentally clobbered some of the state it was
supposed to be reading.)

> +{
> +    char *cpustr = keyhandler_scratch;

Xen style - newline between declarations and code.

keyhandler_scratch is only safe to use in keyhandlers, yet your dump
functions are called from scheduling operations.  You risk concurrent
access with other dump functions and with keyhandlers.
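
One way around this (just a sketch; it assumes a small on-stack buffer is
acceptable in this context, and the 128-byte size is arbitrary) is to drop
keyhandler_scratch from the dump path entirely:

static void
rt_dump_vcpu(const struct rt_vcpu *svc)
{
    char cpustr[128];

    cpumask_scnprintf(cpustr, sizeof(cpustr), svc->vcpu->cpu_hard_affinity);
    /* ... printk of the fields as before ... */
}

As a side effect, the sizeof() usage below becomes correct, since cpustr is
then a real array rather than a pointer.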

> +    if ( svc == NULL ) 
> +    {
> +        printk("NULL!\n");
> +        return;
> +    }

This is quite a useless message on its own.  Is it reasonable for svc to
ever be NULL here?
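
If it genuinely cannot be NULL, an assertion documents that more usefully
than the bare printk (sketch):

    ASSERT( svc != NULL );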

> +    cpumask_scnprintf(cpustr, sizeof(cpustr), svc->vcpu->cpu_hard_affinity);

Buggy sizeof.  sizeof(cpustr) is the size of a pointer, where I suspect
you mean sizeof(keyhandler_scratch), which is 1024.

> +    printk("[%5d.%-2d] cpu %d, (%"PRId64", %"PRId64"), cur_b=%"PRId64" 
> cur_d=%"PRId64" last_start=%"PRId64" onR=%d runnable=%d cpu_hard_affinity=%s 
> ",
> +            svc->vcpu->domain->domain_id,
> +            svc->vcpu->vcpu_id,
> +            svc->vcpu->processor,

vcpu_id and processor are both unsigned quantities, so should be
formatted with %u rather than %d.

> +            svc->period,
> +            svc->budget,
> +            svc->cur_budget,
> +            svc->cur_deadline,
> +            svc->last_start,
> +            __vcpu_on_runq(svc),
> +            vcpu_runnable(svc->vcpu),
> +            cpustr);
> +    memset(cpustr, 0, sizeof(char)*1024);

This memset is quite pointless, and buggy if keyhandler_scratch changes
length.  cpumask_scnprintf() properly NUL-terminates its string.

> +    cpumask_scnprintf(cpustr, sizeof(cpustr), 
> cpupool_scheduler_cpumask(svc->vcpu->domain->cpupool));

Same buggy sizeof.

> +    printk("cpupool=%s\n", cpustr);
> +}
> +
> +static void
> +rt_dump_pcpu(const struct scheduler *ops, int cpu)
> +{
> +    struct rt_vcpu *svc = RT_VCPU(curr_on_cpu(cpu));
> +
> +    printtime();
> +    rt_dump_vcpu(svc);
> +}
> +
> +/*
> + * should not need lock here. only showing stuff 
> + */
> +static void
> +rt_dump(const struct scheduler *ops)
> +{
> +    struct list_head *iter_sdom, *iter_svc, *runq, *iter;
> +    struct rt_private *prv = RT_PRIV(ops);
> +    struct rt_vcpu *svc;
> +    int cpu = 0;
> +    int loop = 0;

Both of these should be unsigned.

> +
> +    printtime();
> +    printk("Priority Scheme: EDF\n");
> +
> +    printk("PCPU info: \n");

Please remove these trailing spaces before newlines.  All they do is
waste space in the console ring.

> +    for_each_cpu(cpu, &prv->cpus) 
> +        rt_dump_pcpu(ops, cpu);
> +
> +    printk("Global RunQueue info: \n");
> +    loop = 0;
> +    runq = RUNQ(ops);
> +    list_for_each( iter, runq ) 
> +    {
> +        svc = __runq_elem(iter);
> +        printk("\t%3d: ", ++loop);
> +        rt_dump_vcpu(svc);
> +    }
> +
> +    printk("Domain info: \n");
> +    loop = 0;
> +    list_for_each( iter_sdom, &prv->sdom ) 
> +    {
> +        struct rt_dom *sdom;
> +        sdom = list_entry(iter_sdom, struct rt_dom, sdom_elem);
> +        printk("\tdomain: %d\n", sdom->dom->domain_id);
> +
> +        list_for_each( iter_svc, &sdom->vcpu ) 
> +        {
> +            svc = list_entry(iter_svc, struct rt_vcpu, sdom_elem);
> +            printk("\t\t%3d: ", ++loop);
> +            rt_dump_vcpu(svc);
> +        }
> +    }
> +
> +    printk("\n");
> +}
> +
> +static inline void
> +__runq_remove(struct rt_vcpu *svc)
> +{
> +    if ( __vcpu_on_runq(svc) )
> +        list_del_init(&svc->runq_elem);
> +}
> +
> +/*
> + * Insert a vcpu in the RunQ based on vcpus' deadline: 
> + * EDF schedule policy: vcpu with smaller deadline has higher priority;
> + * The vcpu svc to be inserted will be inserted just before the very first 
> + * vcpu iter_svc in the Runqueue whose deadline is equal or larger than 
> + * svc's deadline.
> + */
> +static void
> +__runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
> +{
> +    struct list_head *runq = RUNQ(ops);
> +    struct list_head *iter;
> +    spinlock_t *schedule_lock;
> +    
> +    schedule_lock = per_cpu(schedule_data, 
> svc->vcpu->processor).schedule_lock;
> +    ASSERT( spin_is_locked(schedule_lock) );
> +    
> +    /* Debug only */
> +    if ( __vcpu_on_runq(svc) )
> +    {
> +        rt_dump(ops);
> +    }
> +    ASSERT( !__vcpu_on_runq(svc) );
> +
> +    list_for_each(iter, runq) 
> +    {
> +        struct rt_vcpu * iter_svc = __runq_elem(iter);
> +
> +        /* svc still has budget */
> +        if ( svc->cur_budget > 0 ) 
> +        {
> +            if ( iter_svc->cur_budget == 0 ||
> +                 svc->cur_deadline <= iter_svc->cur_deadline )
> +                    break;
> +        } 
> +        else 
> +        { /* svc has no budget */
> +            if ( iter_svc->cur_budget == 0 &&
> +                 svc->cur_deadline <= iter_svc->cur_deadline )
> +                    break;
> +        }
> +    }
> +
> +    list_add_tail(&svc->runq_elem, iter);
> +}
> +
> +/*
> + * Init/Free related code
> + */
> +static int
> +rt_init(struct scheduler *ops)
> +{
> +    struct rt_private *prv = xzalloc(struct rt_private);
> +
> +    if ( prv == NULL )
> +        return -ENOMEM;

Newline in here.

> +    ops->sched_data = prv;

Is it safe to set ops->sched_data to a half-constructed rt_private?  I
suspect this wants to be the very last (non-debug) action in this function.
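
A sketch of what I mean (assuming nothing reads ops->sched_data before
rt_init() returns):

static int
rt_init(struct scheduler *ops)
{
    struct rt_private *prv = xzalloc(struct rt_private);

    if ( prv == NULL )
        return -ENOMEM;

    spin_lock_init(&prv->lock);
    INIT_LIST_HEAD(&prv->sdom);
    INIT_LIST_HEAD(&prv->runq);

    /* Publish the private data only once it is fully constructed. */
    ops->sched_data = prv;

    return 0;
}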

> +    spin_lock_init(&prv->lock);
> +    INIT_LIST_HEAD(&prv->sdom);
> +    INIT_LIST_HEAD(&prv->runq);
> +
> +    printtime();
> +    printk("\n");
> +
> +    return 0;
> +}
> +
> +static void
> +rt_deinit(const struct scheduler *ops)
> +{
> +    struct rt_private *prv = RT_PRIV(ops);
> +
> +    printtime();
> +    printk("\n");
> +    xfree(prv);
> +}
> +
> +/* 
> + * point per_cpu spinlock to the global system lock; all cpu have same 
> global system lock 
> + */
> +static void *
> +rt_alloc_pdata(const struct scheduler *ops, int cpu)
> +{
> +    struct rt_private *prv = RT_PRIV(ops);
> +
> +    cpumask_set_cpu(cpu, &prv->cpus);
> +
> +    per_cpu(schedule_data, cpu).schedule_lock = &prv->lock;
> +
> +    printtime();
> +    printk("%s total cpus: %d", __FUNCTION__, cpumask_weight(&prv->cpus));

__FUNCTION__ is a gccism.  __func__ is a standard way of doing the same.

> +    /* same as credit2, not a bogus pointer */
> +    return (void *)1;
> +}
> +
> +static void
> +rt_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
> +{
> +    struct rt_private * prv = RT_PRIV(ops);
> +    cpumask_clear_cpu(cpu, &prv->cpus);
> +    printtime();
> +    printk("%s cpu=%d\n", __FUNCTION__, cpu);
> +}
> +
> +static void *
> +rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
> +{
> +    unsigned long flags;
> +    struct rt_dom *sdom;
> +    struct rt_private * prv = RT_PRIV(ops);
> +
> +    printtime();
> +    printk("dom=%d\n", dom->domain_id);
> +
> +    sdom = xzalloc(struct rt_dom);
> +    if ( sdom == NULL ) 
> +    {
> +        printk("%s, xzalloc failed\n", __func__);
> +        return NULL;
> +    }
> +
> +    INIT_LIST_HEAD(&sdom->vcpu);
> +    INIT_LIST_HEAD(&sdom->sdom_elem);
> +    sdom->dom = dom;
> +
> +    /* spinlock here to insert the dom */
> +    spin_lock_irqsave(&prv->lock, flags);
> +    list_add_tail(&sdom->sdom_elem, &(prv->sdom));
> +    spin_unlock_irqrestore(&prv->lock, flags);
> +
> +    return (void *)sdom;

Bogus cast.

> +}
> +
> +static void
> +rt_free_domdata(const struct scheduler *ops, void *data)
> +{
> +    unsigned long flags;
> +    struct rt_dom *sdom = data;
> +    struct rt_private *prv = RT_PRIV(ops);
> +
> +    printtime();
> +    printk("dom=%d\n", sdom->dom->domain_id);
> +
> +    spin_lock_irqsave(&prv->lock, flags);
> +    list_del_init(&sdom->sdom_elem);
> +    spin_unlock_irqrestore(&prv->lock, flags);
> +    xfree(data);
> +}
> +
> +static int
> +rt_dom_init(const struct scheduler *ops, struct domain *dom)
> +{
> +    struct rt_dom *sdom;
> +
> +    printtime();
> +    printk("dom=%d\n", dom->domain_id);
> +
> +    /* IDLE Domain does not link on rt_private */
> +    if ( is_idle_domain(dom) ) 
> +        return 0;
> +
> +    sdom = rt_alloc_domdata(ops, dom);
> +    if ( sdom == NULL ) 
> +    {
> +        printk("%s, failed\n", __func__);
> +        return -ENOMEM;
> +    }
> +    dom->sched_priv = sdom;
> +
> +    return 0;
> +}
> +
> +static void
> +rt_dom_destroy(const struct scheduler *ops, struct domain *dom)
> +{
> +    printtime();
> +    printk("dom=%d\n", dom->domain_id);
> +
> +    rt_free_domdata(ops, RT_DOM(dom));
> +}
> +
> +static void *
> +rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
> +{
> +    struct rt_vcpu *svc;
> +    s_time_t now = NOW();
> +    long count;
> +
> +    /* Allocate per-VCPU info */
> +    svc = xzalloc(struct rt_vcpu);
> +    if ( svc == NULL ) 
> +    {
> +        printk("%s, xzalloc failed\n", __func__);
> +        return NULL;
> +    }
> +
> +    INIT_LIST_HEAD(&svc->runq_elem);
> +    INIT_LIST_HEAD(&svc->sdom_elem);
> +    svc->flags = 0U;
> +    svc->sdom = dd;
> +    svc->vcpu = vc;
> +    svc->last_start = 0;            /* init last_start is 0 */
> +
> +    svc->period = RT_DEFAULT_PERIOD;
> +    if ( !is_idle_vcpu(vc) )
> +        svc->budget = RT_DEFAULT_BUDGET;
> +
> +    count = (now/MICROSECS(svc->period)) + 1;
> +    /* sync all VCPU's start time to 0 */
> +    svc->cur_deadline += count * MICROSECS(svc->period);
> +
> +    svc->cur_budget = svc->budget*1000; /* counting in microseconds level */
> +    /* Debug only: dump new vcpu's info */
> +    printtime();
> +    rt_dump_vcpu(svc);
> +
> +    return svc;
> +}
> +
> +static void
> +rt_free_vdata(const struct scheduler *ops, void *priv)
> +{
> +    struct rt_vcpu *svc = priv;
> +
> +    /* Debug only: dump freed vcpu's info */
> +    printtime();
> +    rt_dump_vcpu(svc);
> +    xfree(svc);
> +}
> +
> +/*
> + * TODO: Do we need to add vc to the new Runqueue?
> + * This function is called in sched_move_domain() in schedule.c
> + * When move a domain to a new cpupool, 
> + * may have to add vc to the Runqueue of the new cpupool
> + */
> +static void
> +rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
> +{
> +    struct rt_vcpu *svc = RT_VCPU(vc);
> +
> +    /* Debug only: dump info of vcpu to insert */
> +    printtime();
> +    rt_dump_vcpu(svc);
> +
> +    /* not addlocate idle vcpu to dom vcpu list */
> +    if ( is_idle_vcpu(vc) )
> +        return;
> +
> +    list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);   /* add to dom vcpu 
> list */
> +}
> +
> +/*
> + * TODO: same as rt_vcpu_insert()
> + */
> +static void
> +rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
> +{
> +    struct rt_vcpu * const svc = RT_VCPU(vc);
> +    struct rt_dom * const sdom = svc->sdom;
> +
> +    printtime();
> +    rt_dump_vcpu(svc);
> +
> +    BUG_ON( sdom == NULL );
> +    BUG_ON( __vcpu_on_runq(svc) );
> +
> +    if ( !is_idle_vcpu(vc) ) 
> +        list_del_init(&svc->sdom_elem);
> +}
> +
> +/* 
> + * Pick a valid CPU for the vcpu vc
> + * Valid CPU of a vcpu is intesection of vcpu's affinity and available cpus
> + */
> +static int
> +rt_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
> +{
> +    cpumask_t cpus;
> +    cpumask_t *online;
> +    int cpu;
> +    struct rt_private * prv = RT_PRIV(ops);
> +
> +    online = cpupool_scheduler_cpumask(vc->domain->cpupool);
> +    cpumask_and(&cpus, &prv->cpus, online);
> +    cpumask_and(&cpus, &cpus, vc->cpu_hard_affinity);
> +
> +    cpu = cpumask_test_cpu(vc->processor, &cpus)
> +            ? vc->processor 
> +            : cpumask_cycle(vc->processor, &cpus);
> +    ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );
> +
> +    return cpu;
> +}
> +
> +/*
> + * Burn budget at microsecond level. 
> + */
> +static void
> +burn_budgets(const struct scheduler *ops, struct rt_vcpu *svc, s_time_t now) 
> +{
> +    s_time_t delta;
> +    long count = 0;
> +
> +    /* don't burn budget for idle VCPU */
> +    if ( is_idle_vcpu(svc->vcpu) ) 
> +    {
> +        return;
> +    }
> +
> +    /* first time called for this svc, update last_start */
> +    if ( svc->last_start == 0 ) 
> +    {
> +        svc->last_start = now;
> +        return;
> +    }
> +
> +    /*
> +     * update deadline info: When deadline is in the past,
> +     * it need to be updated to the deadline of the current period,
> +     * and replenish the budget 
> +     */
> +    delta = now - svc->cur_deadline;
> +    if ( delta >= 0 ) 
> +    {
> +        count = ( delta/MICROSECS(svc->period) ) + 1;
> +        svc->cur_deadline += count * MICROSECS(svc->period);
> +        svc->cur_budget = svc->budget * 1000;
> +        return;
> +    }
> +
> +    /* burn at nanoseconds level */
> +    delta = now - svc->last_start;
> +    /* 
> +     * delta < 0 only happens in nested virtualization;
> +     * TODO: how should we handle delta < 0 in a better way? */
> +    if ( delta < 0 ) 
> +    {
> +        printk("%s, ATTENTION: now is behind last_start! delta = %ld for ",
> +                __func__, delta);
> +        rt_dump_vcpu(svc);
> +        svc->last_start = now;  /* update last_start */
> +        svc->cur_budget = 0;   /* FIXME: should we recover like this? */
> +        return;
> +    }
> +
> +    if ( svc->cur_budget == 0 ) 
> +        return;
> +
> +    svc->cur_budget -= delta;
> +    if ( svc->cur_budget < 0 ) 
> +        svc->cur_budget = 0;
> +}
> +
> +/* 
> + * RunQ is sorted. Pick first one within cpumask. If no one, return NULL
> + * lock is grabbed before calling this function 
> + */
> +static struct rt_vcpu *
> +__runq_pick(const struct scheduler *ops, cpumask_t mask)
> +{
> +    struct list_head *runq = RUNQ(ops);
> +    struct list_head *iter;
> +    struct rt_vcpu *svc = NULL;
> +    struct rt_vcpu *iter_svc = NULL;
> +    cpumask_t cpu_common;
> +    cpumask_t *online;
> +    struct rt_private * prv = RT_PRIV(ops);
> +
> +    list_for_each(iter, runq) 
> +    {
> +        iter_svc = __runq_elem(iter);
> +
> +        /* mask is intersection of cpu_hard_affinity and cpupool and 
> priv->cpus */
> +        online = cpupool_scheduler_cpumask(iter_svc->vcpu->domain->cpupool);
> +        cpumask_and(&cpu_common, online, &prv->cpus);
> +        cpumask_and(&cpu_common, &cpu_common, 
> iter_svc->vcpu->cpu_hard_affinity);
> +        cpumask_and(&cpu_common, &mask, &cpu_common);
> +        if ( cpumask_empty(&cpu_common) )
> +            continue;
> +
> +        if ( iter_svc->cur_budget <= 0 )
> +            continue;
> +
> +        svc = iter_svc;
> +        break;
> +    }
> +
> +    return svc;
> +}
> +
> +/*
> + * Update vcpu's budget and sort runq by insert the modifed vcpu back to runq
> + * lock is grabbed before calling this function 
> + */
> +static void
> +__repl_update(const struct scheduler *ops, s_time_t now)
> +{
> +    struct list_head *runq = RUNQ(ops);
> +    struct list_head *iter;
> +    struct list_head *tmp;
> +    struct rt_vcpu *svc = NULL;
> +
> +    s_time_t diff;
> +    long count;
> +
> +    list_for_each_safe(iter, tmp, runq) 
> +    {
> +        svc = __runq_elem(iter);
> +
> +        diff = now - svc->cur_deadline;
> +        if ( diff > 0 ) 
> +        {
> +            count = (diff/MICROSECS(svc->period)) + 1;
> +            svc->cur_deadline += count * MICROSECS(svc->period);
> +            svc->cur_budget = svc->budget * 1000;
> +            __runq_remove(svc);
> +            __runq_insert(ops, svc);
> +        }
> +    }
> +}
> +
> +/* 
> + * schedule function for rt scheduler.
> + * The lock is already grabbed in schedule.c, no need to lock here 
> + */
> +static struct task_slice
> +rt_schedule(const struct scheduler *ops, s_time_t now, bool_t 
> tasklet_work_scheduled)
> +{
> +    const int cpu = smp_processor_id();
> +    struct rt_private * prv = RT_PRIV(ops);
> +    struct rt_vcpu * const scurr = RT_VCPU(current);
> +    struct rt_vcpu * snext = NULL;
> +    struct task_slice ret = { .migrated = 0 };
> +
> +    /* clear ticked bit now that we've been scheduled */
> +    if ( cpumask_test_cpu(cpu, &prv->tickled) )
> +        cpumask_clear_cpu(cpu, &prv->tickled);
> +
> +    /* burn_budget would return for IDLE VCPU */
> +    burn_budgets(ops, scurr, now);
> +
> +    __repl_update(ops, now);
> +
> +    if ( tasklet_work_scheduled ) 
> +    {
> +        snext = RT_VCPU(idle_vcpu[cpu]);
> +    } 
> +    else 
> +    {
> +        cpumask_t cur_cpu;
> +        cpumask_clear(&cur_cpu);
> +        cpumask_set_cpu(cpu, &cur_cpu);
> +        snext = __runq_pick(ops, cur_cpu);
> +        if ( snext == NULL )
> +            snext = RT_VCPU(idle_vcpu[cpu]);
> +
> +        /* if scurr has higher priority and budget, still pick scurr */
> +        if ( !is_idle_vcpu(current) &&
> +             vcpu_runnable(current) &&
> +             scurr->cur_budget > 0 &&
> +             ( is_idle_vcpu(snext->vcpu) ||
> +               scurr->cur_deadline <= snext->cur_deadline ) ) 
> +            snext = scurr;
> +    }
> +
> +    if ( snext != scurr &&
> +         !is_idle_vcpu(current) &&
> +         vcpu_runnable(current) )
> +        set_bit(__RT_delayed_runq_add, &scurr->flags);
> +    
> +
> +    snext->last_start = now;
> +    if ( !is_idle_vcpu(snext->vcpu) ) 
> +    {
> +        if ( snext != scurr ) 
> +        {
> +            __runq_remove(snext);
> +            set_bit(__RT_scheduled, &snext->flags);
> +        }
> +        if ( snext->vcpu->processor != cpu ) 
> +        {
> +            snext->vcpu->processor = cpu;
> +            ret.migrated = 1;
> +        }
> +    }
> +
> +    ret.time = MILLISECS(1); /* sched quantum */
> +    ret.task = snext->vcpu;
> +
> +    return ret;
> +}
> +
> +/*
> + * Remove VCPU from RunQ
> + * The lock is already grabbed in schedule.c, no need to lock here 
> + */
> +static void
> +rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
> +{
> +    struct rt_vcpu * const svc = RT_VCPU(vc);
> +
> +    BUG_ON( is_idle_vcpu(vc) );
> +
> +    if ( curr_on_cpu(vc->processor) == vc ) 
> +    {
> +        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
> +        return;
> +    }
> +
> +    if ( __vcpu_on_runq(svc) ) 
> +    {
> +        __runq_remove(svc);
> +        printk("%s: vcpu should not on runq in vcpu_sleep()\n", 
> __FUNCTION__);
> +        BUG();
> +    }
> +
> +    clear_bit(__RT_delayed_runq_add, &svc->flags);
> +}
> +
> +/*
> + * Pick a vcpu on a cpu to kick out to place the running candidate
> + * Called by wake() and context_saved()
> + * We have a running candidate here, the kick logic is:
> + * Among all the cpus that are within the cpu affinity
> + * 1) if the new->cpu is idle, kick it. This could benefit cache hit
> + * 2) if there are any idle vcpu, kick it.
> + * 3) now all pcpus are busy, among all the running vcpus, pick lowest 
> priority one
> + *    if snext has higher priority, kick it.
> + *
> + * TODO:
> + * 1) what if these two vcpus belongs to the same domain?
> + *    replace a vcpu belonging to the same domain introduces more overhead
> + *
> + * lock is grabbed before calling this function 
> + */
> +static void
> +runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
> +{
> +    struct rt_private * prv = RT_PRIV(ops);
> +    struct rt_vcpu * latest_deadline_vcpu = NULL;    /* lowest priority 
> scheduled */
> +    struct rt_vcpu * iter_svc;
> +    struct vcpu * iter_vc;
> +    int cpu = 0;
> +    cpumask_t not_tickled;
> +    cpumask_t *online;
> +
> +    if ( new == NULL || is_idle_vcpu(new->vcpu) ) 
> +        return;
> +
> +    online = cpupool_scheduler_cpumask(new->vcpu->domain->cpupool);
> +    cpumask_and(&not_tickled, online, &prv->cpus);
> +    cpumask_and(&not_tickled, &not_tickled, new->vcpu->cpu_hard_affinity);
> +    cpumask_andnot(&not_tickled, &not_tickled, &prv->tickled);
> +
> +    /* 1) if new's previous cpu is idle, kick it for cache benefit */
> +    if ( is_idle_vcpu(curr_on_cpu(new->vcpu->processor)) ) 
> +    {
> +        cpumask_set_cpu(new->vcpu->processor, &prv->tickled);
> +        cpu_raise_softirq(new->vcpu->processor, SCHEDULE_SOFTIRQ);
> +        return;
> +    }
> +
> +    /* 2) if there are any idle pcpu, kick it */
> +    /* The same loop also find the one with lowest priority */
> +    for_each_cpu(cpu, &not_tickled) 
> +    {
> +        iter_vc = curr_on_cpu(cpu);
> +        if ( is_idle_vcpu(iter_vc) ) 
> +        {
> +            cpumask_set_cpu(cpu, &prv->tickled);
> +            cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
> +            return;
> +        }
> +        iter_svc = RT_VCPU(iter_vc);
> +        if ( latest_deadline_vcpu == NULL || 
> +             iter_svc->cur_deadline > latest_deadline_vcpu->cur_deadline )
> +            latest_deadline_vcpu = iter_svc;
> +    }
> +
> +    /* 3) candicate has higher priority, kick out the lowest priority vcpu */
> +    if ( latest_deadline_vcpu != NULL && new->cur_deadline < 
> latest_deadline_vcpu->cur_deadline ) 
> +    {
> +        cpumask_set_cpu(latest_deadline_vcpu->vcpu->processor, 
> &prv->tickled);
> +        cpu_raise_softirq(latest_deadline_vcpu->vcpu->processor, 
> SCHEDULE_SOFTIRQ);
> +    }
> +    return;
> +}
> +
> +/* 
> + * Should always wake up runnable vcpu, put it back to RunQ. 
> + * Check priority to raise interrupt 
> + * The lock is already grabbed in schedule.c, no need to lock here 
> + * TODO: what if these two vcpus belongs to the same domain? 
> + */
> +static void
> +rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
> +{
> +    struct rt_vcpu * const svc = RT_VCPU(vc);
> +    s_time_t diff;
> +    s_time_t now = NOW();
> +    long count = 0;
> +    struct rt_private * prv = RT_PRIV(ops);
> +    struct rt_vcpu * snext = NULL;        /* highest priority on RunQ */
> +
> +    BUG_ON( is_idle_vcpu(vc) );
> +
> +    if ( unlikely(curr_on_cpu(vc->processor) == vc) ) 
> +        return;
> +
> +    /* on RunQ, just update info is ok */
> +    if ( unlikely(__vcpu_on_runq(svc)) ) 
> +        return;
> +
> +    /* If context hasn't been saved for this vcpu yet, we can't put it on
> +     * the Runqueue. Instead, we set a flag so that it will be put on the 
> Runqueue
> +     * After the context has been saved. */
> +    if ( unlikely(test_bit(__RT_scheduled, &svc->flags)) ) 
> +    {
> +        set_bit(__RT_delayed_runq_add, &svc->flags);
> +        return;
> +    }
> +
> +    /* update deadline info */
> +    diff = now - svc->cur_deadline;
> +    if ( diff >= 0 ) 
> +    {
> +        count = ( diff/MICROSECS(svc->period) ) + 1;
> +        svc->cur_deadline += count * MICROSECS(svc->period);
> +        svc->cur_budget = svc->budget * 1000;
> +    }
> +
> +    __runq_insert(ops, svc);
> +    __repl_update(ops, now);
> +    snext = __runq_pick(ops, prv->cpus);    /* pick snext from ALL valid 
> cpus */
> +    runq_tickle(ops, snext);
> +
> +    return;
> +}
> +
> +/* 
> + * scurr has finished context switch, insert it back to the RunQ,
> + * and then pick the highest priority vcpu from runq to run 
> + */
> +static void
> +rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
> +{
> +    struct rt_vcpu * svc = RT_VCPU(vc);
> +    struct rt_vcpu * snext = NULL;
> +    struct rt_private * prv = RT_PRIV(ops);
> +    spinlock_t *lock = vcpu_schedule_lock_irq(vc);
> +
> +    clear_bit(__RT_scheduled, &svc->flags);
> +    /* not insert idle vcpu to runq */
> +    if ( is_idle_vcpu(vc) ) 
> +        goto out;
> +
> +    if ( test_and_clear_bit(__RT_delayed_runq_add, &svc->flags) && 
> +         likely(vcpu_runnable(vc)) ) 
> +    {
> +        __runq_insert(ops, svc);
> +        __repl_update(ops, NOW());
> +        snext = __runq_pick(ops, prv->cpus);    /* pick snext from ALL cpus 
> */
> +        runq_tickle(ops, snext);
> +    }
> +out:
> +    vcpu_schedule_unlock_irq(lock, vc);
> +}
> +
> +/*
> + * set/get each vcpu info of each domain
> + */
> +static int
> +rt_dom_cntl(
> +    const struct scheduler *ops, 
> +    struct domain *d, 
> +    struct xen_domctl_scheduler_op *op)
> +{
> +    xen_domctl_sched_rt_params_t *local_sched;
> +    struct rt_dom * const sdom = RT_DOM(d);
> +    struct list_head *iter;
> +    int vcpu_index = 0;
> +    int rc = -EINVAL;
> +
> +    switch ( op->cmd )
> +    {
> +    case XEN_DOMCTL_SCHEDOP_getnumvcpus:
> +        op->u.rt.nr_vcpus = 0;
> +        list_for_each( iter, &sdom->vcpu ) 
> +            vcpu_index++;
> +        op->u.rt.nr_vcpus = vcpu_index;
> +        rc = 0;
> +        break;
> +    case XEN_DOMCTL_SCHEDOP_getinfo:
> +        /* for debug use, whenever adjust Dom0 parameter, do global dump */
> +        if ( d->domain_id == 0 ) 
> +            rt_dump(ops);
> +
> +        op->u.rt.nr_vcpus = 0;
> +        list_for_each( iter, &sdom->vcpu ) 
> +            vcpu_index++;
> +        op->u.rt.nr_vcpus = vcpu_index;
> +        local_sched = xzalloc_array(xen_domctl_sched_rt_params_t, 
> vcpu_index);
> +        vcpu_index = 0;
> +        list_for_each( iter, &sdom->vcpu ) 
> +        {
> +            struct rt_vcpu * svc = list_entry(iter, struct rt_vcpu, 
> sdom_elem);
> +
> +            local_sched[vcpu_index].budget = svc->budget;
> +            local_sched[vcpu_index].period = svc->period;
> +            local_sched[vcpu_index].index = vcpu_index;
> +            vcpu_index++;
> +        }
> +        copy_to_guest(op->u.rt.vcpu, local_sched, vcpu_index);

This will clobber the guest heap if vcpu_index is greater than the space
the caller allocated.  You also unconditionally leak local_sched, but
there is no need for an allocation in the first place.
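
A rough sketch of the allocation-free approach (it assumes that, on entry,
op->u.rt.nr_vcpus carries the number of elements the caller allocated in
op->u.rt.vcpu; the exact interface is up to you):

    uint16_t nr = 0;

    list_for_each( iter, &sdom->vcpu )
    {
        struct rt_vcpu *svc = list_entry(iter, struct rt_vcpu, sdom_elem);
        xen_domctl_sched_rt_params_t local;

        if ( nr == op->u.rt.nr_vcpus )  /* no more space in the guest array */
            break;

        local.budget = svc->budget;
        local.period = svc->period;
        local.index  = nr;

        if ( copy_to_guest_offset(op->u.rt.vcpu, nr, &local, 1) )
            return -EFAULT;

        nr++;
    }
    op->u.rt.nr_vcpus = nr;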

> +        rc = 0;
> +        break;
> +    case XEN_DOMCTL_SCHEDOP_putinfo:
> +        list_for_each( iter, &sdom->vcpu ) 
> +        {
> +            struct rt_vcpu * svc = list_entry(iter, struct rt_vcpu, 
> sdom_elem);
> +
> +            /* adjust per VCPU parameter */
> +            if ( op->u.rt.vcpu_index == svc->vcpu->vcpu_id ) 
> +            { 
> +                vcpu_index = op->u.rt.vcpu_index;
> +
> +                if ( vcpu_index < 0 ) 
> +                    printk("XEN_DOMCTL_SCHEDOP_putinfo: vcpu_index=%d\n",
> +                            vcpu_index);
> +                else
> +                    printk("XEN_DOMCTL_SCHEDOP_putinfo: "
> +                            "vcpu_index=%d, period=%"PRId64", 
> budget=%"PRId64"\n",
> +                            vcpu_index, op->u.rt.period, op->u.rt.budget);
> +
> +                svc->period = op->u.rt.period;
> +                svc->budget = op->u.rt.budget;
> +
> +                break;
> +            }
> +        }
> +        rc = 0;
> +        break;
> +    }
> +
> +    return rc;
> +}
> +
> +static struct rt_private _rt_priv;
> +
> +const struct scheduler sched_rt_def = {
> +    .name           = "SMP RT Scheduler",
> +    .opt_name       = "rt",

Should these names reflect RT_DS as opposed to simply RT?

> +    .sched_id       = XEN_SCHEDULER_RT_DS,
> +    .sched_data     = &_rt_priv,
> +
> +    .dump_cpu_state = rt_dump_pcpu,
> +    .dump_settings  = rt_dump,
> +    .init           = rt_init,
> +    .deinit         = rt_deinit,
> +    .alloc_pdata    = rt_alloc_pdata,
> +    .free_pdata     = rt_free_pdata,
> +    .alloc_domdata  = rt_alloc_domdata,
> +    .free_domdata   = rt_free_domdata,
> +    .init_domain    = rt_dom_init,
> +    .destroy_domain = rt_dom_destroy,
> +    .alloc_vdata    = rt_alloc_vdata,
> +    .free_vdata     = rt_free_vdata,
> +    .insert_vcpu    = rt_vcpu_insert,
> +    .remove_vcpu    = rt_vcpu_remove,
> +
> +    .adjust         = rt_dom_cntl,
> +
> +    .pick_cpu       = rt_cpu_pick,
> +    .do_schedule    = rt_schedule,
> +    .sleep          = rt_vcpu_sleep,
> +    .wake           = rt_vcpu_wake,
> +    .context_saved  = rt_context_saved,
> +};
> diff --git a/xen/common/schedule.c b/xen/common/schedule.c
> index e9eb0bc..f2df400 100644
> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -68,6 +68,7 @@ static const struct scheduler *schedulers[] = {
>      &sched_sedf_def,
>      &sched_credit_def,
>      &sched_credit2_def,
> +    &sched_rt_def,
>      &sched_arinc653_def,
>  };
>  
> @@ -1092,7 +1093,8 @@ long sched_adjust(struct domain *d, struct 
> xen_domctl_scheduler_op *op)
>  
>      if ( (op->sched_id != DOM2OP(d)->sched_id) ||
>           ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
> -          (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
> +          (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo) &&
> +          (op->cmd != XEN_DOMCTL_SCHEDOP_getnumvcpus)) )
>          return -EINVAL;
>  
>      /* NB: the pluggable scheduler code needs to take care
> diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
> index 5b11bbf..8d4b973 100644
> --- a/xen/include/public/domctl.h
> +++ b/xen/include/public/domctl.h
> @@ -339,6 +339,18 @@ struct xen_domctl_max_vcpus {
>  typedef struct xen_domctl_max_vcpus xen_domctl_max_vcpus_t;
>  DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
>  
> +/*
> + * This structure is used to pass to rt scheduler from a 
> + * privileged domain to Xen
> + */
> +struct xen_domctl_sched_rt_params {
> +    /* get vcpus' info */
> +    int64_t period; /* s_time_t type */
> +    int64_t budget;
> +    int     index;

Index is clearly an unsigned quantity.  For alignment and compatibility,
uint64_t would make the most sense.  Alternatively, uint32_t and an
explicit uint32_t pad field.
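
I.e. something along these lines (a sketch of the second option):

struct xen_domctl_sched_rt_params {
    int64_t  period;  /* s_time_t type */
    int64_t  budget;
    uint32_t index;
    uint32_t pad;     /* explicit padding, so the struct layout is the
                         same for 32-bit and 64-bit toolstacks */
};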

~Andrew

> +};
> +typedef struct xen_domctl_sched_rt_params xen_domctl_sched_rt_params_t;
> +DEFINE_XEN_GUEST_HANDLE(xen_domctl_sched_rt_params_t);
>  
>  /* XEN_DOMCTL_scheduler_op */
>  /* Scheduler types. */
> @@ -346,9 +358,12 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
>  #define XEN_SCHEDULER_CREDIT   5
>  #define XEN_SCHEDULER_CREDIT2  6
>  #define XEN_SCHEDULER_ARINC653 7
> +#define XEN_SCHEDULER_RT_DS    8
> +
>  /* Set or get info? */
> -#define XEN_DOMCTL_SCHEDOP_putinfo 0
> -#define XEN_DOMCTL_SCHEDOP_getinfo 1
> +#define XEN_DOMCTL_SCHEDOP_putinfo      0
> +#define XEN_DOMCTL_SCHEDOP_getinfo      1
> +#define XEN_DOMCTL_SCHEDOP_getnumvcpus   2
>  struct xen_domctl_scheduler_op {
>      uint32_t sched_id;  /* XEN_SCHEDULER_* */
>      uint32_t cmd;       /* XEN_DOMCTL_SCHEDOP_* */
> @@ -367,6 +382,15 @@ struct xen_domctl_scheduler_op {
>          struct xen_domctl_sched_credit2 {
>              uint16_t weight;
>          } credit2;
> +        struct xen_domctl_sched_rt{
> +            /* get vcpus' params */
> +            XEN_GUEST_HANDLE_64(xen_domctl_sched_rt_params_t) vcpu;
> +            uint16_t nr_vcpus;
> +            /* set one vcpu's params */
> +            uint16_t vcpu_index;
> +            int64_t period;
> +            int64_t budget;
> +        } rt;
>      } u;
>  };
>  typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
> diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h
> index 4164dff..bcbe234 100644
> --- a/xen/include/xen/sched-if.h
> +++ b/xen/include/xen/sched-if.h
> @@ -169,6 +169,7 @@ extern const struct scheduler sched_sedf_def;
>  extern const struct scheduler sched_credit_def;
>  extern const struct scheduler sched_credit2_def;
>  extern const struct scheduler sched_arinc653_def;
> +extern const struct scheduler sched_rt_def;
>  
>  
>  struct cpupool


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel