Re: [Xen-devel] [PATCH RFC v2 1/4] xen: add real time scheduler rt
On 29/07/14 02:52, Meng Xu wrote:
> This scheduler follows the pre-emptive Global EDF theory in real-time field.
> Each VCPU can have a dedicated period and budget.
> While scheduled, a VCPU burns its budget.
> A VCPU has its budget replenished at the beginning of each of its periods;
> The VCPU discards its unused budget at the end of each of its periods.
> If a VCPU runs out of budget in a period, it has to wait until next period.
> The mechanism of how to burn a VCPU's budget depends on the server mechanism
> implemented for each VCPU.
>
> Server mechanism: a VCPU is implemented as a deferable server.
> When a VCPU is scheduled to execute on a PCPU, its budget is continuously
> burned.
>
> Priority scheme: Preemptive Global Earliest Deadline First (gEDF).
> At any scheduling point, the VCPU with earliest deadline has highest
> priority.
>
> Queue scheme: A global Runqueue for each CPU pool.
> The Runqueue holds all runnable VCPUs.
> VCPUs in the Runqueue are divided into two parts: with and without budget.
> At each part, VCPUs are sorted based on gEDF priority scheme.
>
> Scheduling quantum: 1 ms;
>
> Note: cpumask and cpupool is supported.
>
> This is still in the development phase.
>
> Signed-off-by: Sisu Xi <xisisu@xxxxxxxxx>
> Signed-off-by: Meng Xu <mengxu@xxxxxxxxxxxxx>
> ---
> xen/common/Makefile | 1 +
> xen/common/sched_rt.c | 1058
> +++++++++++++++++++++++++++++++++++++++++++
> xen/common/schedule.c | 4 +-
> xen/include/public/domctl.h | 28 +-
> xen/include/xen/sched-if.h | 1 +
> 5 files changed, 1089 insertions(+), 3 deletions(-)
> create mode 100644 xen/common/sched_rt.c
>
> diff --git a/xen/common/Makefile b/xen/common/Makefile
> index 3683ae3..5a23aa4 100644
> --- a/xen/common/Makefile
> +++ b/xen/common/Makefile
> @@ -26,6 +26,7 @@ obj-y += sched_credit.o
> obj-y += sched_credit2.o
> obj-y += sched_sedf.o
> obj-y += sched_arinc653.o
> +obj-y += sched_rt.o
> obj-y += schedule.o
> obj-y += shutdown.o
> obj-y += softirq.o
> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> new file mode 100644
> index 0000000..6cfbb8a
> --- /dev/null
> +++ b/xen/common/sched_rt.c
> @@ -0,0 +1,1058 @@
> +/******************************************************************************
> + * Preemptive Global Earliest Deadline First (EDF) scheduler for Xen
> + * EDF scheduling is one of most popular real-time scheduling algorithm used
> in
> + * embedded field.
> + *
> + * by Sisu Xi, 2013, Washington University in Saint Louis
> + * and Meng Xu, 2014, University of Pennsylvania
> + *
> + * based on the code of credit Scheduler
> + */
> +
> +#include <xen/config.h>
> +#include <xen/init.h>
> +#include <xen/lib.h>
> +#include <xen/sched.h>
> +#include <xen/domain.h>
> +#include <xen/delay.h>
> +#include <xen/event.h>
> +#include <xen/time.h>
> +#include <xen/perfc.h>
> +#include <xen/sched-if.h>
> +#include <xen/softirq.h>
> +#include <asm/atomic.h>
> +#include <xen/errno.h>
> +#include <xen/trace.h>
> +#include <xen/cpu.h>
> +#include <xen/keyhandler.h>
> +#include <xen/trace.h>
> +#include <xen/guest_access.h>
> +
> +/*
> + * TODO:
> + *
> + * Migration compensation and resist like credit2 to better use cache;
> + * Lock Holder Problem, using yield?
> + * Self switch problem: VCPUs of the same domain may preempt each other;
> + */
> +
> +/*
> + * Design:
> + *
> + * This scheduler follows the Preemptive Global EDF theory in real-time
> field.
> + * Each VCPU can have a dedicated period and budget.
> + * While scheduled, a VCPU burns its budget.
> + * A VCPU has its budget replenished at the beginning of each of its periods;
> + * The VCPU discards its unused budget at the end of each of its periods.
> + * If a VCPU runs out of budget in a period, it has to wait until next
> period.
> + * The mechanism of how to burn a VCPU's budget depends on the server
> mechanism
> + * implemented for each VCPU.
> + *
> + * Server mechanism: a VCPU is implemented as a deferable server.
> + * When a VCPU has a task running on it, its budget is continuously burned;
> + * When a VCPU has no task but with budget left, its budget is preserved.
> + *
> + * Priority scheme: Preemptive Global Earliest Deadline First (gEDF).
> + * At any scheduling point, the VCPU with earliest deadline has highest
> priority.
> + *
> + * Queue scheme: A global runqueue for each CPU pool.
> + * The runqueue holds all runnable VCPUs.
> + * VCPUs in the runqueue are divided into two parts: with and without
> remaining budget.
> + * At each part, VCPUs are sorted based on EDF priority scheme.
> + *
> + * Scheduling quanta: 1 ms; but accounting the budget is in microsecond.
> + *
> + * Note: cpumask and cpupool is supported.
> + */
> +
> +/*
> + * Locking:
> + * Just like credit2, a global system lock is used to protect the RunQ.
> + * The global lock is referenced by schedule_data.schedule_lock from all
> physical cpus.
> + *
> + * The lock is already grabbed when calling wake/sleep/schedule/ functions
> in schedule.c
> + *
> + * The functions involes RunQ and needs to grab locks are:
> + * dump, vcpu_insert, vcpu_remove, context_saved,
> + */
> +
> +
> +/*
> + * Default parameters: Period and budget in default is 10 and 4 ms,
> respectively
> + */
> +#define RT_DEFAULT_PERIOD (MICROSECS(10))
> +#define RT_DEFAULT_BUDGET (MICROSECS(4))
> +
> +/*
> + * Useful macros
> + */
> +#define RT_PRIV(_ops) ((struct rt_private *)((_ops)->sched_data))
> +#define RT_VCPU(_vcpu) ((struct rt_vcpu *)(_vcpu)->sched_priv)
> +#define RT_DOM(_dom) ((struct rt_dom *)(_dom)->sched_priv)
> +#define RUNQ(_ops) (&RT_PRIV(_ops)->runq)
I know you are copying the prevailing style, but these macros are nasty.
They would be perfectly fine as static inlines with some real types...
static inline struct rt_private *RT_PRIV(const struct scheduler *ops)
{
    return ops->sched_data;
}
... which allow for rather more useful compiler errors in the case that
they get accidentally misused.
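The same treatment works for the other helpers, e.g. (just a sketch, reusing
the names from your patch; these would need to live below the struct
definitions so the types are complete):

static inline struct rt_vcpu *RT_VCPU(const struct vcpu *vcpu)
{
    return vcpu->sched_priv;
}

static inline struct list_head *RUNQ(const struct scheduler *ops)
{
    return &RT_PRIV(ops)->runq;
}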
> +
> +/*
> + * Flags
> + */
> +/* RT_scheduled: Is this vcpu either running on, or context-switching off,
> + * a phyiscal cpu?
> + * + Accessed only with Runqueue lock held.
> + * + Set when chosen as next in rt_schedule().
> + * + Cleared after context switch has been saved in rt_context_saved()
> + * + Checked in vcpu_wake to see if we can add to the Runqueue, or if we
> should
> + * set RT_delayed_runq_add
> + * + Checked to be false in runq_insert.
> + */
> +#define __RT_scheduled 1
> +#define RT_scheduled (1<<__RT_scheduled)
> +/* RT_delayed_runq_add: Do we need to add this to the Runqueueu once it'd
> done
> + * being context switching out?
> + * + Set when scheduling out in rt_schedule() if prev is runable
> + * + Set in rt_vcpu_wake if it finds RT_scheduled set
> + * + Read in rt_context_saved(). If set, it adds prev to the Runqueue and
> + * clears the bit.
> + *
> + */
> +#define __RT_delayed_runq_add 2
> +#define RT_delayed_runq_add (1<<__RT_delayed_runq_add)
> +
> +/*
> + * Debug only. Used to printout debug information
> + */
> +#define printtime()\
> + ({s_time_t now = NOW(); \
> + printk("%u : %3ld.%3ldus : %-19s ",\
> + smp_processor_id(), now/MICROSECS(1), now%MICROSECS(1)/1000,
> __func__);} )
> +
> +/*
> + * Systme-wide private data, include a global RunQueue
> + * The global lock is referenced by schedule_data.schedule_lock from all
> physical cpus.
> + * It can be grabbed via vcpu_schedule_lock_irq()
> + */
> +struct rt_private {
> + spinlock_t lock; /* The global coarse grand lock */
> + struct list_head sdom; /* list of availalbe domains, used for dump */
> + struct list_head runq; /* Ordered list of runnable VMs */
> + cpumask_t cpus; /* cpumask_t of available physical cpus */
> + cpumask_t tickled; /* another cpu in the queue already ticked this
> one */
> +};
> +
> +/*
> + * Virtual CPU
> + */
> +struct rt_vcpu {
> + struct list_head runq_elem; /* On the runqueue list */
> + struct list_head sdom_elem; /* On the domain VCPU list */
> +
> + /* Up-pointers */
> + struct rt_dom *sdom;
> + struct vcpu *vcpu;
> +
> + /* VCPU parameters, in milliseconds */
> + s_time_t period;
> + s_time_t budget;
> +
> + /* VCPU current infomation in nanosecond */
> + long cur_budget; /* current budget */
> + s_time_t last_start; /* last start time */
> + s_time_t cur_deadline; /* current deadline for EDF */
> +
> + unsigned flags; /* mark __RT_scheduled, etc.. */
> +};
> +
> +/*
> + * Domain
> + */
> +struct rt_dom {
> + struct list_head vcpu; /* link its VCPUs */
> + struct list_head sdom_elem; /* link list on rt_priv */
> + struct domain *dom; /* pointer to upper domain */
> +};
> +
> +/*
> + * RunQueue helper functions
> + */
> +static int
> +__vcpu_on_runq(struct rt_vcpu *svc)
> +{
> + return !list_empty(&svc->runq_elem);
> +}
> +
> +static struct rt_vcpu *
> +__runq_elem(struct list_head *elem)
> +{
> + return list_entry(elem, struct rt_vcpu, runq_elem);
> +}
> +
> +/*
> + * Debug related code, dump vcpu/cpu information
> + */
> +static void
> +rt_dump_vcpu(struct rt_vcpu *svc)
const struct rt_vcpu *svc, for added safety. (In the past we have had a
dump function which accidentally clobbered some of the state it was
supposed to be reading)
> +{
> + char *cpustr = keyhandler_scratch;
Xen style - newline between declarations and code.
The keyhandler_scratch is only safe to use in keyhandlers, yet your dump
functions are based on scheduling operations. You risk concurrent
access with other dump functions and with keyhandlers.
> + if ( svc == NULL )
> + {
> + printk("NULL!\n");
> + return;
> + }
This is quite a useless message on its own. Is it reasonable for svc to
ever be NULL here?
> + cpumask_scnprintf(cpustr, sizeof(cpustr), svc->vcpu->cpu_hard_affinity);
Buggy sizeof. sizeof(cpustr) is the size of a pointer, where I suspect you
mean sizeof(keyhandler_scratch), which is 1024.
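A local buffer would avoid both the shared-scratch problem above and the
sizeof bug, e.g. (sketch only; size chosen to match keyhandler_scratch):

    char cpustr[1024];

    cpumask_scnprintf(cpustr, sizeof(cpustr), svc->vcpu->cpu_hard_affinity);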
> + printk("[%5d.%-2d] cpu %d, (%"PRId64", %"PRId64"), cur_b=%"PRId64"
> cur_d=%"PRId64" last_start=%"PRId64" onR=%d runnable=%d cpu_hard_affinity=%s
> ",
> + svc->vcpu->domain->domain_id,
> + svc->vcpu->vcpu_id,
> + svc->vcpu->processor,
vcpu_id and processor are both unsigned quantities, so should be
formatted with %u rather than %d.
> + svc->period,
> + svc->budget,
> + svc->cur_budget,
> + svc->cur_deadline,
> + svc->last_start,
> + __vcpu_on_runq(svc),
> + vcpu_runnable(svc->vcpu),
> + cpustr);
> + memset(cpustr, 0, sizeof(char)*1024);
This memset is quite pointless, and buggy if keyhandler_scratch changes
length. cpumask_scnprintf() properly NUL-terminates its string.
> + cpumask_scnprintf(cpustr, sizeof(cpustr),
> cpupool_scheduler_cpumask(svc->vcpu->domain->cpupool));
Same buggy sizeof.
> + printk("cpupool=%s\n", cpustr);
> +}
> +
> +static void
> +rt_dump_pcpu(const struct scheduler *ops, int cpu)
> +{
> + struct rt_vcpu *svc = RT_VCPU(curr_on_cpu(cpu));
> +
> + printtime();
> + rt_dump_vcpu(svc);
> +}
> +
> +/*
> + * should not need lock here. only showing stuff
> + */
> +static void
> +rt_dump(const struct scheduler *ops)
> +{
> + struct list_head *iter_sdom, *iter_svc, *runq, *iter;
> + struct rt_private *prv = RT_PRIV(ops);
> + struct rt_vcpu *svc;
> + int cpu = 0;
> + int loop = 0;
Both of these should be unsigned.
> +
> + printtime();
> + printk("Priority Scheme: EDF\n");
> +
> + printk("PCPU info: \n");
Please remove these trailing spaces before newlines. All they do is
waste space in the console ring.
> + for_each_cpu(cpu, &prv->cpus)
> + rt_dump_pcpu(ops, cpu);
> +
> + printk("Global RunQueue info: \n");
> + loop = 0;
> + runq = RUNQ(ops);
> + list_for_each( iter, runq )
> + {
> + svc = __runq_elem(iter);
> + printk("\t%3d: ", ++loop);
> + rt_dump_vcpu(svc);
> + }
> +
> + printk("Domain info: \n");
> + loop = 0;
> + list_for_each( iter_sdom, &prv->sdom )
> + {
> + struct rt_dom *sdom;
> + sdom = list_entry(iter_sdom, struct rt_dom, sdom_elem);
> + printk("\tdomain: %d\n", sdom->dom->domain_id);
> +
> + list_for_each( iter_svc, &sdom->vcpu )
> + {
> + svc = list_entry(iter_svc, struct rt_vcpu, sdom_elem);
> + printk("\t\t%3d: ", ++loop);
> + rt_dump_vcpu(svc);
> + }
> + }
> +
> + printk("\n");
> +}
> +
> +static inline void
> +__runq_remove(struct rt_vcpu *svc)
> +{
> + if ( __vcpu_on_runq(svc) )
> + list_del_init(&svc->runq_elem);
> +}
> +
> +/*
> + * Insert a vcpu in the RunQ based on vcpus' deadline:
> + * EDF schedule policy: vcpu with smaller deadline has higher priority;
> + * The vcpu svc to be inserted will be inserted just before the very first
> + * vcpu iter_svc in the Runqueue whose deadline is equal or larger than
> + * svc's deadline.
> + */
> +static void
> +__runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
> +{
> + struct list_head *runq = RUNQ(ops);
> + struct list_head *iter;
> + spinlock_t *schedule_lock;
> +
> + schedule_lock = per_cpu(schedule_data,
> svc->vcpu->processor).schedule_lock;
> + ASSERT( spin_is_locked(schedule_lock) );
> +
> + /* Debug only */
> + if ( __vcpu_on_runq(svc) )
> + {
> + rt_dump(ops);
> + }
> + ASSERT( !__vcpu_on_runq(svc) );
> +
> + list_for_each(iter, runq)
> + {
> + struct rt_vcpu * iter_svc = __runq_elem(iter);
> +
> + /* svc still has budget */
> + if ( svc->cur_budget > 0 )
> + {
> + if ( iter_svc->cur_budget == 0 ||
> + svc->cur_deadline <= iter_svc->cur_deadline )
> + break;
> + }
> + else
> + { /* svc has no budget */
> + if ( iter_svc->cur_budget == 0 &&
> + svc->cur_deadline <= iter_svc->cur_deadline )
> + break;
> + }
> + }
> +
> + list_add_tail(&svc->runq_elem, iter);
> +}
> +
> +/*
> + * Init/Free related code
> + */
> +static int
> +rt_init(struct scheduler *ops)
> +{
> + struct rt_private *prv = xzalloc(struct rt_private);
> +
> + if ( prv == NULL )
> + return -ENOMEM;
Newline in here.
> + ops->sched_data = prv;
Is it safe to set ops->sched_data with a half-constructed rt_private? I
suspect this wants to be the very last (non-debug) action in this function.
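i.e. something along these lines (sketch, using the initialisation already in
your patch):

    spin_lock_init(&prv->lock);
    INIT_LIST_HEAD(&prv->sdom);
    INIT_LIST_HEAD(&prv->runq);

    /* Publish only once fully constructed. */
    ops->sched_data = prv;

    return 0;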
> + spin_lock_init(&prv->lock);
> + INIT_LIST_HEAD(&prv->sdom);
> + INIT_LIST_HEAD(&prv->runq);
> +
> + printtime();
> + printk("\n");
> +
> + return 0;
> +}
> +
> +static void
> +rt_deinit(const struct scheduler *ops)
> +{
> + struct rt_private *prv = RT_PRIV(ops);
> +
> + printtime();
> + printk("\n");
> + xfree(prv);
> +}
> +
> +/*
> + * point per_cpu spinlock to the global system lock; all cpu have same
> global system lock
> + */
> +static void *
> +rt_alloc_pdata(const struct scheduler *ops, int cpu)
> +{
> + struct rt_private *prv = RT_PRIV(ops);
> +
> + cpumask_set_cpu(cpu, &prv->cpus);
> +
> + per_cpu(schedule_data, cpu).schedule_lock = &prv->lock;
> +
> + printtime();
> + printk("%s total cpus: %d", __FUNCTION__, cpumask_weight(&prv->cpus));
__FUNCTION__ is a gccism. __func__ is the standard way of doing the same.
> + /* same as credit2, not a bogus pointer */
> + return (void *)1;
> +}
> +
> +static void
> +rt_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
> +{
> + struct rt_private * prv = RT_PRIV(ops);
> + cpumask_clear_cpu(cpu, &prv->cpus);
> + printtime();
> + printk("%s cpu=%d\n", __FUNCTION__, cpu);
> +}
> +
> +static void *
> +rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
> +{
> + unsigned long flags;
> + struct rt_dom *sdom;
> + struct rt_private * prv = RT_PRIV(ops);
> +
> + printtime();
> + printk("dom=%d\n", dom->domain_id);
> +
> + sdom = xzalloc(struct rt_dom);
> + if ( sdom == NULL )
> + {
> + printk("%s, xzalloc failed\n", __func__);
> + return NULL;
> + }
> +
> + INIT_LIST_HEAD(&sdom->vcpu);
> + INIT_LIST_HEAD(&sdom->sdom_elem);
> + sdom->dom = dom;
> +
> + /* spinlock here to insert the dom */
> + spin_lock_irqsave(&prv->lock, flags);
> + list_add_tail(&sdom->sdom_elem, &(prv->sdom));
> + spin_unlock_irqrestore(&prv->lock, flags);
> +
> + return (void *)sdom;
Bogus cast. sdom is already a pointer, so a plain "return sdom;" will do.
> +}
> +
> +static void
> +rt_free_domdata(const struct scheduler *ops, void *data)
> +{
> + unsigned long flags;
> + struct rt_dom *sdom = data;
> + struct rt_private *prv = RT_PRIV(ops);
> +
> + printtime();
> + printk("dom=%d\n", sdom->dom->domain_id);
> +
> + spin_lock_irqsave(&prv->lock, flags);
> + list_del_init(&sdom->sdom_elem);
> + spin_unlock_irqrestore(&prv->lock, flags);
> + xfree(data);
> +}
> +
> +static int
> +rt_dom_init(const struct scheduler *ops, struct domain *dom)
> +{
> + struct rt_dom *sdom;
> +
> + printtime();
> + printk("dom=%d\n", dom->domain_id);
> +
> + /* IDLE Domain does not link on rt_private */
> + if ( is_idle_domain(dom) )
> + return 0;
> +
> + sdom = rt_alloc_domdata(ops, dom);
> + if ( sdom == NULL )
> + {
> + printk("%s, failed\n", __func__);
> + return -ENOMEM;
> + }
> + dom->sched_priv = sdom;
> +
> + return 0;
> +}
> +
> +static void
> +rt_dom_destroy(const struct scheduler *ops, struct domain *dom)
> +{
> + printtime();
> + printk("dom=%d\n", dom->domain_id);
> +
> + rt_free_domdata(ops, RT_DOM(dom));
> +}
> +
> +static void *
> +rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
> +{
> + struct rt_vcpu *svc;
> + s_time_t now = NOW();
> + long count;
> +
> + /* Allocate per-VCPU info */
> + svc = xzalloc(struct rt_vcpu);
> + if ( svc == NULL )
> + {
> + printk("%s, xzalloc failed\n", __func__);
> + return NULL;
> + }
> +
> + INIT_LIST_HEAD(&svc->runq_elem);
> + INIT_LIST_HEAD(&svc->sdom_elem);
> + svc->flags = 0U;
> + svc->sdom = dd;
> + svc->vcpu = vc;
> + svc->last_start = 0; /* init last_start is 0 */
> +
> + svc->period = RT_DEFAULT_PERIOD;
> + if ( !is_idle_vcpu(vc) )
> + svc->budget = RT_DEFAULT_BUDGET;
> +
> + count = (now/MICROSECS(svc->period)) + 1;
> + /* sync all VCPU's start time to 0 */
> + svc->cur_deadline += count * MICROSECS(svc->period);
> +
> + svc->cur_budget = svc->budget*1000; /* counting in microseconds level */
> + /* Debug only: dump new vcpu's info */
> + printtime();
> + rt_dump_vcpu(svc);
> +
> + return svc;
> +}
> +
> +static void
> +rt_free_vdata(const struct scheduler *ops, void *priv)
> +{
> + struct rt_vcpu *svc = priv;
> +
> + /* Debug only: dump freed vcpu's info */
> + printtime();
> + rt_dump_vcpu(svc);
> + xfree(svc);
> +}
> +
> +/*
> + * TODO: Do we need to add vc to the new Runqueue?
> + * This function is called in sched_move_domain() in schedule.c
> + * When move a domain to a new cpupool,
> + * may have to add vc to the Runqueue of the new cpupool
> + */
> +static void
> +rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
> +{
> + struct rt_vcpu *svc = RT_VCPU(vc);
> +
> + /* Debug only: dump info of vcpu to insert */
> + printtime();
> + rt_dump_vcpu(svc);
> +
> + /* not addlocate idle vcpu to dom vcpu list */
> + if ( is_idle_vcpu(vc) )
> + return;
> +
> + list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu); /* add to dom vcpu
> list */
> +}
> +
> +/*
> + * TODO: same as rt_vcpu_insert()
> + */
> +static void
> +rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
> +{
> + struct rt_vcpu * const svc = RT_VCPU(vc);
> + struct rt_dom * const sdom = svc->sdom;
> +
> + printtime();
> + rt_dump_vcpu(svc);
> +
> + BUG_ON( sdom == NULL );
> + BUG_ON( __vcpu_on_runq(svc) );
> +
> + if ( !is_idle_vcpu(vc) )
> + list_del_init(&svc->sdom_elem);
> +}
> +
> +/*
> + * Pick a valid CPU for the vcpu vc
> + * Valid CPU of a vcpu is intesection of vcpu's affinity and available cpus
> + */
> +static int
> +rt_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
> +{
> + cpumask_t cpus;
> + cpumask_t *online;
> + int cpu;
> + struct rt_private * prv = RT_PRIV(ops);
> +
> + online = cpupool_scheduler_cpumask(vc->domain->cpupool);
> + cpumask_and(&cpus, &prv->cpus, online);
> + cpumask_and(&cpus, &cpus, vc->cpu_hard_affinity);
> +
> + cpu = cpumask_test_cpu(vc->processor, &cpus)
> + ? vc->processor
> + : cpumask_cycle(vc->processor, &cpus);
> + ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );
> +
> + return cpu;
> +}
> +
> +/*
> + * Burn budget at microsecond level.
> + */
> +static void
> +burn_budgets(const struct scheduler *ops, struct rt_vcpu *svc, s_time_t now)
> +{
> + s_time_t delta;
> + long count = 0;
> +
> + /* don't burn budget for idle VCPU */
> + if ( is_idle_vcpu(svc->vcpu) )
> + {
> + return;
> + }
> +
> + /* first time called for this svc, update last_start */
> + if ( svc->last_start == 0 )
> + {
> + svc->last_start = now;
> + return;
> + }
> +
> + /*
> + * update deadline info: When deadline is in the past,
> + * it need to be updated to the deadline of the current period,
> + * and replenish the budget
> + */
> + delta = now - svc->cur_deadline;
> + if ( delta >= 0 )
> + {
> + count = ( delta/MICROSECS(svc->period) ) + 1;
> + svc->cur_deadline += count * MICROSECS(svc->period);
> + svc->cur_budget = svc->budget * 1000;
> + return;
> + }
> +
> + /* burn at nanoseconds level */
> + delta = now - svc->last_start;
> + /*
> + * delta < 0 only happens in nested virtualization;
> + * TODO: how should we handle delta < 0 in a better way? */
> + if ( delta < 0 )
> + {
> + printk("%s, ATTENTION: now is behind last_start! delta = %ld for ",
> + __func__, delta);
> + rt_dump_vcpu(svc);
> + svc->last_start = now; /* update last_start */
> + svc->cur_budget = 0; /* FIXME: should we recover like this? */
> + return;
> + }
> +
> + if ( svc->cur_budget == 0 )
> + return;
> +
> + svc->cur_budget -= delta;
> + if ( svc->cur_budget < 0 )
> + svc->cur_budget = 0;
> +}
> +
> +/*
> + * RunQ is sorted. Pick first one within cpumask. If no one, return NULL
> + * lock is grabbed before calling this function
> + */
> +static struct rt_vcpu *
> +__runq_pick(const struct scheduler *ops, cpumask_t mask)
> +{
> + struct list_head *runq = RUNQ(ops);
> + struct list_head *iter;
> + struct rt_vcpu *svc = NULL;
> + struct rt_vcpu *iter_svc = NULL;
> + cpumask_t cpu_common;
> + cpumask_t *online;
> + struct rt_private * prv = RT_PRIV(ops);
> +
> + list_for_each(iter, runq)
> + {
> + iter_svc = __runq_elem(iter);
> +
> + /* mask is intersection of cpu_hard_affinity and cpupool and
> priv->cpus */
> + online = cpupool_scheduler_cpumask(iter_svc->vcpu->domain->cpupool);
> + cpumask_and(&cpu_common, online, &prv->cpus);
> + cpumask_and(&cpu_common, &cpu_common,
> iter_svc->vcpu->cpu_hard_affinity);
> + cpumask_and(&cpu_common, &mask, &cpu_common);
> + if ( cpumask_empty(&cpu_common) )
> + continue;
> +
> + if ( iter_svc->cur_budget <= 0 )
> + continue;
> +
> + svc = iter_svc;
> + break;
> + }
> +
> + return svc;
> +}
> +
> +/*
> + * Update vcpu's budget and sort runq by insert the modifed vcpu back to runq
> + * lock is grabbed before calling this function
> + */
> +static void
> +__repl_update(const struct scheduler *ops, s_time_t now)
> +{
> + struct list_head *runq = RUNQ(ops);
> + struct list_head *iter;
> + struct list_head *tmp;
> + struct rt_vcpu *svc = NULL;
> +
> + s_time_t diff;
> + long count;
> +
> + list_for_each_safe(iter, tmp, runq)
> + {
> + svc = __runq_elem(iter);
> +
> + diff = now - svc->cur_deadline;
> + if ( diff > 0 )
> + {
> + count = (diff/MICROSECS(svc->period)) + 1;
> + svc->cur_deadline += count * MICROSECS(svc->period);
> + svc->cur_budget = svc->budget * 1000;
> + __runq_remove(svc);
> + __runq_insert(ops, svc);
> + }
> + }
> +}
> +
> +/*
> + * schedule function for rt scheduler.
> + * The lock is already grabbed in schedule.c, no need to lock here
> + */
> +static struct task_slice
> +rt_schedule(const struct scheduler *ops, s_time_t now, bool_t
> tasklet_work_scheduled)
> +{
> + const int cpu = smp_processor_id();
> + struct rt_private * prv = RT_PRIV(ops);
> + struct rt_vcpu * const scurr = RT_VCPU(current);
> + struct rt_vcpu * snext = NULL;
> + struct task_slice ret = { .migrated = 0 };
> +
> + /* clear ticked bit now that we've been scheduled */
> + if ( cpumask_test_cpu(cpu, &prv->tickled) )
> + cpumask_clear_cpu(cpu, &prv->tickled);
> +
> + /* burn_budget would return for IDLE VCPU */
> + burn_budgets(ops, scurr, now);
> +
> + __repl_update(ops, now);
> +
> + if ( tasklet_work_scheduled )
> + {
> + snext = RT_VCPU(idle_vcpu[cpu]);
> + }
> + else
> + {
> + cpumask_t cur_cpu;
> + cpumask_clear(&cur_cpu);
> + cpumask_set_cpu(cpu, &cur_cpu);
> + snext = __runq_pick(ops, cur_cpu);
> + if ( snext == NULL )
> + snext = RT_VCPU(idle_vcpu[cpu]);
> +
> + /* if scurr has higher priority and budget, still pick scurr */
> + if ( !is_idle_vcpu(current) &&
> + vcpu_runnable(current) &&
> + scurr->cur_budget > 0 &&
> + ( is_idle_vcpu(snext->vcpu) ||
> + scurr->cur_deadline <= snext->cur_deadline ) )
> + snext = scurr;
> + }
> +
> + if ( snext != scurr &&
> + !is_idle_vcpu(current) &&
> + vcpu_runnable(current) )
> + set_bit(__RT_delayed_runq_add, &scurr->flags);
> +
> +
> + snext->last_start = now;
> + if ( !is_idle_vcpu(snext->vcpu) )
> + {
> + if ( snext != scurr )
> + {
> + __runq_remove(snext);
> + set_bit(__RT_scheduled, &snext->flags);
> + }
> + if ( snext->vcpu->processor != cpu )
> + {
> + snext->vcpu->processor = cpu;
> + ret.migrated = 1;
> + }
> + }
> +
> + ret.time = MILLISECS(1); /* sched quantum */
> + ret.task = snext->vcpu;
> +
> + return ret;
> +}
> +
> +/*
> + * Remove VCPU from RunQ
> + * The lock is already grabbed in schedule.c, no need to lock here
> + */
> +static void
> +rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
> +{
> + struct rt_vcpu * const svc = RT_VCPU(vc);
> +
> + BUG_ON( is_idle_vcpu(vc) );
> +
> + if ( curr_on_cpu(vc->processor) == vc )
> + {
> + cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
> + return;
> + }
> +
> + if ( __vcpu_on_runq(svc) )
> + {
> + __runq_remove(svc);
> + printk("%s: vcpu should not on runq in vcpu_sleep()\n",
> __FUNCTION__);
> + BUG();
> + }
> +
> + clear_bit(__RT_delayed_runq_add, &svc->flags);
> +}
> +
> +/*
> + * Pick a vcpu on a cpu to kick out to place the running candidate
> + * Called by wake() and context_saved()
> + * We have a running candidate here, the kick logic is:
> + * Among all the cpus that are within the cpu affinity
> + * 1) if the new->cpu is idle, kick it. This could benefit cache hit
> + * 2) if there are any idle vcpu, kick it.
> + * 3) now all pcpus are busy, among all the running vcpus, pick lowest
> priority one
> + * if snext has higher priority, kick it.
> + *
> + * TODO:
> + * 1) what if these two vcpus belongs to the same domain?
> + * replace a vcpu belonging to the same domain introduces more overhead
> + *
> + * lock is grabbed before calling this function
> + */
> +static void
> +runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
> +{
> + struct rt_private * prv = RT_PRIV(ops);
> + struct rt_vcpu * latest_deadline_vcpu = NULL; /* lowest priority
> scheduled */
> + struct rt_vcpu * iter_svc;
> + struct vcpu * iter_vc;
> + int cpu = 0;
> + cpumask_t not_tickled;
> + cpumask_t *online;
> +
> + if ( new == NULL || is_idle_vcpu(new->vcpu) )
> + return;
> +
> + online = cpupool_scheduler_cpumask(new->vcpu->domain->cpupool);
> + cpumask_and(¬_tickled, online, &prv->cpus);
> + cpumask_and(¬_tickled, ¬_tickled, new->vcpu->cpu_hard_affinity);
> + cpumask_andnot(¬_tickled, ¬_tickled, &prv->tickled);
> +
> + /* 1) if new's previous cpu is idle, kick it for cache benefit */
> + if ( is_idle_vcpu(curr_on_cpu(new->vcpu->processor)) )
> + {
> + cpumask_set_cpu(new->vcpu->processor, &prv->tickled);
> + cpu_raise_softirq(new->vcpu->processor, SCHEDULE_SOFTIRQ);
> + return;
> + }
> +
> + /* 2) if there are any idle pcpu, kick it */
> + /* The same loop also find the one with lowest priority */
> + for_each_cpu(cpu, ¬_tickled)
> + {
> + iter_vc = curr_on_cpu(cpu);
> + if ( is_idle_vcpu(iter_vc) )
> + {
> + cpumask_set_cpu(cpu, &prv->tickled);
> + cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
> + return;
> + }
> + iter_svc = RT_VCPU(iter_vc);
> + if ( latest_deadline_vcpu == NULL ||
> + iter_svc->cur_deadline > latest_deadline_vcpu->cur_deadline )
> + latest_deadline_vcpu = iter_svc;
> + }
> +
> + /* 3) candicate has higher priority, kick out the lowest priority vcpu */
> + if ( latest_deadline_vcpu != NULL && new->cur_deadline <
> latest_deadline_vcpu->cur_deadline )
> + {
> + cpumask_set_cpu(latest_deadline_vcpu->vcpu->processor,
> &prv->tickled);
> + cpu_raise_softirq(latest_deadline_vcpu->vcpu->processor,
> SCHEDULE_SOFTIRQ);
> + }
> + return;
> +}
> +
> +/*
> + * Should always wake up runnable vcpu, put it back to RunQ.
> + * Check priority to raise interrupt
> + * The lock is already grabbed in schedule.c, no need to lock here
> + * TODO: what if these two vcpus belongs to the same domain?
> + */
> +static void
> +rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
> +{
> + struct rt_vcpu * const svc = RT_VCPU(vc);
> + s_time_t diff;
> + s_time_t now = NOW();
> + long count = 0;
> + struct rt_private * prv = RT_PRIV(ops);
> + struct rt_vcpu * snext = NULL; /* highest priority on RunQ */
> +
> + BUG_ON( is_idle_vcpu(vc) );
> +
> + if ( unlikely(curr_on_cpu(vc->processor) == vc) )
> + return;
> +
> + /* on RunQ, just update info is ok */
> + if ( unlikely(__vcpu_on_runq(svc)) )
> + return;
> +
> + /* If context hasn't been saved for this vcpu yet, we can't put it on
> + * the Runqueue. Instead, we set a flag so that it will be put on the
> Runqueue
> + * After the context has been saved. */
> + if ( unlikely(test_bit(__RT_scheduled, &svc->flags)) )
> + {
> + set_bit(__RT_delayed_runq_add, &svc->flags);
> + return;
> + }
> +
> + /* update deadline info */
> + diff = now - svc->cur_deadline;
> + if ( diff >= 0 )
> + {
> + count = ( diff/MICROSECS(svc->period) ) + 1;
> + svc->cur_deadline += count * MICROSECS(svc->period);
> + svc->cur_budget = svc->budget * 1000;
> + }
> +
> + __runq_insert(ops, svc);
> + __repl_update(ops, now);
> + snext = __runq_pick(ops, prv->cpus); /* pick snext from ALL valid
> cpus */
> + runq_tickle(ops, snext);
> +
> + return;
> +}
> +
> +/*
> + * scurr has finished context switch, insert it back to the RunQ,
> + * and then pick the highest priority vcpu from runq to run
> + */
> +static void
> +rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
> +{
> + struct rt_vcpu * svc = RT_VCPU(vc);
> + struct rt_vcpu * snext = NULL;
> + struct rt_private * prv = RT_PRIV(ops);
> + spinlock_t *lock = vcpu_schedule_lock_irq(vc);
> +
> + clear_bit(__RT_scheduled, &svc->flags);
> + /* not insert idle vcpu to runq */
> + if ( is_idle_vcpu(vc) )
> + goto out;
> +
> + if ( test_and_clear_bit(__RT_delayed_runq_add, &svc->flags) &&
> + likely(vcpu_runnable(vc)) )
> + {
> + __runq_insert(ops, svc);
> + __repl_update(ops, NOW());
> + snext = __runq_pick(ops, prv->cpus); /* pick snext from ALL cpus
> */
> + runq_tickle(ops, snext);
> + }
> +out:
> + vcpu_schedule_unlock_irq(lock, vc);
> +}
> +
> +/*
> + * set/get each vcpu info of each domain
> + */
> +static int
> +rt_dom_cntl(
> + const struct scheduler *ops,
> + struct domain *d,
> + struct xen_domctl_scheduler_op *op)
> +{
> + xen_domctl_sched_rt_params_t *local_sched;
> + struct rt_dom * const sdom = RT_DOM(d);
> + struct list_head *iter;
> + int vcpu_index = 0;
> + int rc = -EINVAL;
> +
> + switch ( op->cmd )
> + {
> + case XEN_DOMCTL_SCHEDOP_getnumvcpus:
> + op->u.rt.nr_vcpus = 0;
> + list_for_each( iter, &sdom->vcpu )
> + vcpu_index++;
> + op->u.rt.nr_vcpus = vcpu_index;
> + rc = 0;
> + break;
> + case XEN_DOMCTL_SCHEDOP_getinfo:
> + /* for debug use, whenever adjust Dom0 parameter, do global dump */
> + if ( d->domain_id == 0 )
> + rt_dump(ops);
> +
> + op->u.rt.nr_vcpus = 0;
> + list_for_each( iter, &sdom->vcpu )
> + vcpu_index++;
> + op->u.rt.nr_vcpus = vcpu_index;
> + local_sched = xzalloc_array(xen_domctl_sched_rt_params_t,
> vcpu_index);
> + vcpu_index = 0;
> + list_for_each( iter, &sdom->vcpu )
> + {
> + struct rt_vcpu * svc = list_entry(iter, struct rt_vcpu,
> sdom_elem);
> +
> + local_sched[vcpu_index].budget = svc->budget;
> + local_sched[vcpu_index].period = svc->period;
> + local_sched[vcpu_index].index = vcpu_index;
> + vcpu_index++;
> + }
> + copy_to_guest(op->u.rt.vcpu, local_sched, vcpu_index);
This will clobber the guest heap if vcpu_index is greater than the space the
guest allocated. You also unconditionally leak local_sched, and there is no
need for an allocation in the first place.
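Copying element-by-element straight from the list sidesteps the allocation
entirely. A sketch (assuming the caller fills op->u.rt.nr_vcpus on entry with
the number of elements the guest buffer can hold):

    vcpu_index = 0;
    list_for_each( iter, &sdom->vcpu )
    {
        struct rt_vcpu *svc = list_entry(iter, struct rt_vcpu, sdom_elem);
        xen_domctl_sched_rt_params_t local = {
            .budget = svc->budget,
            .period = svc->period,
            .index  = vcpu_index,
        };

        /* Stop once the guest-provided buffer is full. */
        if ( vcpu_index >= op->u.rt.nr_vcpus )
            break;

        if ( copy_to_guest_offset(op->u.rt.vcpu, vcpu_index, &local, 1) )
        {
            rc = -EFAULT;
            break;
        }

        vcpu_index++;
    }
    op->u.rt.nr_vcpus = vcpu_index;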
> + rc = 0;
> + break;
> + case XEN_DOMCTL_SCHEDOP_putinfo:
> + list_for_each( iter, &sdom->vcpu )
> + {
> + struct rt_vcpu * svc = list_entry(iter, struct rt_vcpu,
> sdom_elem);
> +
> + /* adjust per VCPU parameter */
> + if ( op->u.rt.vcpu_index == svc->vcpu->vcpu_id )
> + {
> + vcpu_index = op->u.rt.vcpu_index;
> +
> + if ( vcpu_index < 0 )
> + printk("XEN_DOMCTL_SCHEDOP_putinfo: vcpu_index=%d\n",
> + vcpu_index);
> + else
> + printk("XEN_DOMCTL_SCHEDOP_putinfo: "
> + "vcpu_index=%d, period=%"PRId64",
> budget=%"PRId64"\n",
> + vcpu_index, op->u.rt.period, op->u.rt.budget);
> +
> + svc->period = op->u.rt.period;
> + svc->budget = op->u.rt.budget;
> +
> + break;
> + }
> + }
> + rc = 0;
> + break;
> + }
> +
> + return rc;
> +}
> +
> +static struct rt_private _rt_priv;
> +
> +const struct scheduler sched_rt_def = {
> + .name = "SMP RT Scheduler",
> + .opt_name = "rt",
Should these names reflect RT_DS as opposed to simply RT?
> + .sched_id = XEN_SCHEDULER_RT_DS,
> + .sched_data = &_rt_priv,
> +
> + .dump_cpu_state = rt_dump_pcpu,
> + .dump_settings = rt_dump,
> + .init = rt_init,
> + .deinit = rt_deinit,
> + .alloc_pdata = rt_alloc_pdata,
> + .free_pdata = rt_free_pdata,
> + .alloc_domdata = rt_alloc_domdata,
> + .free_domdata = rt_free_domdata,
> + .init_domain = rt_dom_init,
> + .destroy_domain = rt_dom_destroy,
> + .alloc_vdata = rt_alloc_vdata,
> + .free_vdata = rt_free_vdata,
> + .insert_vcpu = rt_vcpu_insert,
> + .remove_vcpu = rt_vcpu_remove,
> +
> + .adjust = rt_dom_cntl,
> +
> + .pick_cpu = rt_cpu_pick,
> + .do_schedule = rt_schedule,
> + .sleep = rt_vcpu_sleep,
> + .wake = rt_vcpu_wake,
> + .context_saved = rt_context_saved,
> +};
> diff --git a/xen/common/schedule.c b/xen/common/schedule.c
> index e9eb0bc..f2df400 100644
> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -68,6 +68,7 @@ static const struct scheduler *schedulers[] = {
> &sched_sedf_def,
> &sched_credit_def,
> &sched_credit2_def,
> + &sched_rt_def,
> &sched_arinc653_def,
> };
>
> @@ -1092,7 +1093,8 @@ long sched_adjust(struct domain *d, struct
> xen_domctl_scheduler_op *op)
>
> if ( (op->sched_id != DOM2OP(d)->sched_id) ||
> ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
> - (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
> + (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo) &&
> + (op->cmd != XEN_DOMCTL_SCHEDOP_getnumvcpus)) )
> return -EINVAL;
>
> /* NB: the pluggable scheduler code needs to take care
> diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
> index 5b11bbf..8d4b973 100644
> --- a/xen/include/public/domctl.h
> +++ b/xen/include/public/domctl.h
> @@ -339,6 +339,18 @@ struct xen_domctl_max_vcpus {
> typedef struct xen_domctl_max_vcpus xen_domctl_max_vcpus_t;
> DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
>
> +/*
> + * This structure is used to pass to rt scheduler from a
> + * privileged domain to Xen
> + */
> +struct xen_domctl_sched_rt_params {
> + /* get vcpus' info */
> + int64_t period; /* s_time_t type */
> + int64_t budget;
> + int index;
Index is clearly an unsigned quantity. For alignment and compatibility,
uint64_t would make the most sense. Alternatively, uint32_t and an
explicit uint32_t pad field.
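e.g. (sketch):

struct xen_domctl_sched_rt_params {
    int64_t period;   /* s_time_t */
    int64_t budget;
    uint32_t index;
    uint32_t pad;     /* keep size and alignment explicit */
};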
~Andrew
> +};
> +typedef struct xen_domctl_sched_rt_params xen_domctl_sched_rt_params_t;
> +DEFINE_XEN_GUEST_HANDLE(xen_domctl_sched_rt_params_t);
>
> /* XEN_DOMCTL_scheduler_op */
> /* Scheduler types. */
> @@ -346,9 +358,12 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
> #define XEN_SCHEDULER_CREDIT 5
> #define XEN_SCHEDULER_CREDIT2 6
> #define XEN_SCHEDULER_ARINC653 7
> +#define XEN_SCHEDULER_RT_DS 8
> +
> /* Set or get info? */
> -#define XEN_DOMCTL_SCHEDOP_putinfo 0
> -#define XEN_DOMCTL_SCHEDOP_getinfo 1
> +#define XEN_DOMCTL_SCHEDOP_putinfo 0
> +#define XEN_DOMCTL_SCHEDOP_getinfo 1
> +#define XEN_DOMCTL_SCHEDOP_getnumvcpus 2
> struct xen_domctl_scheduler_op {
> uint32_t sched_id; /* XEN_SCHEDULER_* */
> uint32_t cmd; /* XEN_DOMCTL_SCHEDOP_* */
> @@ -367,6 +382,15 @@ struct xen_domctl_scheduler_op {
> struct xen_domctl_sched_credit2 {
> uint16_t weight;
> } credit2;
> + struct xen_domctl_sched_rt{
> + /* get vcpus' params */
> + XEN_GUEST_HANDLE_64(xen_domctl_sched_rt_params_t) vcpu;
> + uint16_t nr_vcpus;
> + /* set one vcpu's params */
> + uint16_t vcpu_index;
> + int64_t period;
> + int64_t budget;
> + } rt;
> } u;
> };
> typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
> diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h
> index 4164dff..bcbe234 100644
> --- a/xen/include/xen/sched-if.h
> +++ b/xen/include/xen/sched-if.h
> @@ -169,6 +169,7 @@ extern const struct scheduler sched_sedf_def;
> extern const struct scheduler sched_credit_def;
> extern const struct scheduler sched_credit2_def;
> extern const struct scheduler sched_arinc653_def;
> +extern const struct scheduler sched_rt_def;
>
>
> struct cpupool
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel