[Xen-devel] [PATCH v3 1/4] xen: add real time scheduler rtds
This scheduler follows the Preemptive Global Earliest Deadline First (EDF)
theory from the real-time field. At any scheduling point, the VCPU with the
earlier deadline has the higher priority. The scheduler always picks the
highest-priority VCPU to run on a feasible PCPU. A PCPU is feasible if the
VCPU can run on this PCPU and (the PCPU is idle or has a lower-priority VCPU
running on it).

Each VCPU has a dedicated period and budget. The deadline of a VCPU is at the
end of each period; a VCPU has its budget replenished at the beginning of each
period; while scheduled, a VCPU burns its budget. The VCPU needs to finish its
budget before its deadline in each period; the VCPU discards its unused budget
at the end of each period. If a VCPU runs out of budget in a period, it has to
wait until the next period.

Each VCPU is implemented as a deferrable server: when a VCPU has a task
running on it, its budget is continuously burned; when a VCPU has no task but
still has budget left, its budget is preserved.

Queue scheme: a global runqueue and a global depletedq for each CPU pool. The
runqueue holds all runnable VCPUs with budget, sorted by deadline; the
depletedq holds all VCPUs without budget, unsorted.

Note: cpumask and cpupool are supported.

This is an experimental scheduler.

Signed-off-by: Meng Xu <mengxu@xxxxxxxxxxxxx>
Signed-off-by: Sisu Xi <xisisu@xxxxxxxxx>
---
 xen/common/Makefile         |    1 +
 xen/common/sched_rt.c       | 1126 +++++++++++++++++++++++++++++++++++++++++++
 xen/common/schedule.c       |    1 +
 xen/include/public/domctl.h |    6 +
 xen/include/public/trace.h  |    1 +
 xen/include/xen/sched-if.h  |    1 +
 6 files changed, 1136 insertions(+)
 create mode 100644 xen/common/sched_rt.c

diff --git a/xen/common/Makefile b/xen/common/Makefile
index 3683ae3..5a23aa4 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -26,6 +26,7 @@ obj-y += sched_credit.o
 obj-y += sched_credit2.o
 obj-y += sched_sedf.o
 obj-y += sched_arinc653.o
+obj-y += sched_rt.o
 obj-y += schedule.o
 obj-y += shutdown.o
 obj-y += softirq.o
diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
new file mode 100644
index 0000000..7ee2500
--- /dev/null
+++ b/xen/common/sched_rt.c
@@ -0,0 +1,1126 @@
+/*****************************************************************************
+ * Preemptive Global Earliest Deadline First (EDF) scheduler for Xen
+ * EDF scheduling is a real-time scheduling algorithm used in the embedded
+ * field.
+ *
+ * by Sisu Xi, 2013, Washington University in Saint Louis
+ * and Meng Xu, 2014, University of Pennsylvania
+ *
+ * based on the code of the Credit scheduler
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/perfc.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+#include <asm/atomic.h>
+#include <xen/errno.h>
+#include <xen/trace.h>
+#include <xen/cpu.h>
+#include <xen/keyhandler.h>
+#include <xen/guest_access.h>
+
+/*
+ * TODO:
+ *
+ * Migration compensation and resistance, like credit2, to make better use of
+ * the cache;
+ * Lock holder problem: use yield?
+ * Self-switch problem: VCPUs of the same domain may preempt each other;
+ */
+
+/*
+ * Design:
+ *
+ * This scheduler follows the Preemptive Global Earliest Deadline First (EDF)
+ * theory from the real-time field.
+ * At any scheduling point, the VCPU with the earlier deadline has the higher
+ * priority.
+ * The scheduler always picks the highest-priority VCPU to run on a feasible
+ * PCPU.
+ * A PCPU is feasible if the VCPU can run on this PCPU and (the PCPU is idle
+ * or has a lower-priority VCPU running on it).
+ *
+ * Each VCPU has a dedicated period and budget.
+ * The deadline of a VCPU is at the end of each period;
+ * A VCPU has its budget replenished at the beginning of each period;
+ * While scheduled, a VCPU burns its budget.
+ * The VCPU needs to finish its budget before its deadline in each period;
+ * The VCPU discards its unused budget at the end of each period.
+ * If a VCPU runs out of budget in a period, it has to wait until the next
+ * period.
+ *
+ * Each VCPU is implemented as a deferrable server.
+ * When a VCPU has a task running on it, its budget is continuously burned;
+ * When a VCPU has no task but still has budget left, its budget is preserved.
+ *
+ * Queue scheme:
+ * A global runqueue and a global depletedqueue for each CPU pool.
+ * The runqueue holds all runnable VCPUs with budget, sorted by deadline;
+ * The depletedqueue holds all VCPUs without budget, unsorted;
+ *
+ * Note: cpumask and cpupool are supported.
+ */
+
+/*
+ * Locking:
+ * A global system lock is used to protect the RunQ and DepletedQ.
+ * The global lock is referenced by schedule_data.schedule_lock
+ * from all physical cpus.
+ *
+ * The lock is already grabbed when calling the wake/sleep/schedule functions
+ * in schedule.c
+ *
+ * The functions that involve the RunQ and need to grab the lock are:
+ * vcpu_insert, vcpu_remove, context_saved, __runq_insert
+ */
+
+
+/*
+ * Default parameters:
+ * Period and budget default to 10 and 4 ms, respectively
+ */
+#define RTDS_DEFAULT_PERIOD     (MICROSECS(10000))
+#define RTDS_DEFAULT_BUDGET     (MICROSECS(4000))
+
+/*
+ * Flags
+ */
+/*
+ * RTDS_scheduled: Is this vcpu either running on, or context-switching off,
+ * a physical cpu?
+ * + Accessed only with global lock held.
+ * + Set when chosen as next in rt_schedule().
+ * + Cleared after context switch has been saved in rt_context_saved()
+ * + Checked in vcpu_wake to see if we can add to the Runqueue, or if we should
+ *   set RTDS_delayed_runq_add
+ * + Checked to be false in runq_insert.
+ */
+#define __RTDS_scheduled            1
+#define RTDS_scheduled (1<<__RTDS_scheduled)
+/*
+ * RTDS_delayed_runq_add: Do we need to add this to the RunQ/DepletedQ
+ * once it's done being context switched out?
+ * + Set when scheduling out in rt_schedule() if prev is runnable
+ * + Set in rt_vcpu_wake if it finds RTDS_scheduled set
+ * + Read in rt_context_saved(). If set, it adds prev to the Runqueue/DepletedQ
+ *   and clears the bit.
+ */
+#define __RTDS_delayed_runq_add     2
+#define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add)
+
+/*
+ * rt tracing events ("only" 512 available!). Check
+ * include/public/trace.h for more details.
+ */
+#define TRC_RTDS_TICKLE           TRC_SCHED_CLASS_EVT(RTDS, 1)
+#define TRC_RTDS_RUNQ_PICK        TRC_SCHED_CLASS_EVT(RTDS, 2)
+#define TRC_RTDS_BUDGET_BURN      TRC_SCHED_CLASS_EVT(RTDS, 3)
+#define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4)
+#define TRC_RTDS_SCHED_TASKLET    TRC_SCHED_CLASS_EVT(RTDS, 5)
+#define TRC_RTDS_VCPU_CREATE      TRC_SCHED_CLASS_EVT(RTDS, 6)
+
+/*
+ * System-wide private data, including the global RunQueue/DepletedQ
+ * The global lock is referenced by schedule_data.schedule_lock from all
+ * physical cpus. It can be grabbed via vcpu_schedule_lock_irq()
+ */
+struct rt_private {
+    spinlock_t lock;            /* the global coarse-grained lock */
+    struct list_head sdom;      /* list of available domains, used for dump */
+    struct list_head runq;      /* ordered list of runnable vcpus */
+    struct list_head depletedq; /* unordered list of depleted vcpus */
+    cpumask_t tickled;          /* cpus that have been tickled */
+};
+
+/*
+ * Virtual CPU
+ */
+struct rt_vcpu {
+    struct list_head q_elem;    /* on the runq/depletedq list */
+    struct list_head sdom_elem; /* on the domain VCPU list */
+
+    /* Up-pointers */
+    struct rt_dom *sdom;
+    struct vcpu *vcpu;
+
+    /* VCPU parameters, in nanoseconds */
+    s_time_t period;
+    s_time_t budget;
+
+    /* VCPU current information, in nanoseconds */
+    s_time_t cur_budget;        /* current budget */
+    s_time_t last_start;        /* last start time */
+    s_time_t cur_deadline;      /* current deadline for EDF */
+
+    unsigned flags;             /* mark __RTDS_scheduled, etc.. */
+};
+
+/*
+ * Domain
+ */
+struct rt_dom {
+    struct list_head vcpu;      /* link its VCPUs */
+    struct list_head sdom_elem; /* link list on rt_priv */
+    struct domain *dom;         /* pointer to upper domain */
+};
+
+/*
+ * Useful inline functions
+ */
+static inline struct rt_private *rt_priv(const struct scheduler *ops)
+{
+    return ops->sched_data;
+}
+
+static inline struct rt_vcpu *rt_vcpu(const struct vcpu *vcpu)
+{
+    return vcpu->sched_priv;
+}
+
+static inline struct rt_dom *rt_dom(const struct domain *dom)
+{
+    return dom->sched_priv;
+}
+
+static inline struct list_head *rt_runq(const struct scheduler *ops)
+{
+    return &rt_priv(ops)->runq;
+}
+
+static inline struct list_head *rt_depletedq(const struct scheduler *ops)
+{
+    return &rt_priv(ops)->depletedq;
+}
+
+/*
+ * Queue helper functions for runq and depletedq
+ */
+static int
+__vcpu_on_q(const struct rt_vcpu *svc)
+{
+    return !list_empty(&svc->q_elem);
+}
+
+static struct rt_vcpu *
+__q_elem(struct list_head *elem)
+{
+    return list_entry(elem, struct rt_vcpu, q_elem);
+}
+
+/*
+ * Debug related code, dump vcpu/cpu information
+ */
+static void
+rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc)
+{
+    char cpustr[1024];
+    cpumask_t *cpupool_mask;
+
+    ASSERT(svc != NULL);
+    /* idle vcpu */
+    if( svc->sdom == NULL )
+        return;
+
+    cpumask_scnprintf(cpustr, sizeof(cpustr), svc->vcpu->cpu_hard_affinity);
+    printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime"),"
+           " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n"
+           " \t\t onQ=%d runnable=%d cpu_hard_affinity=%s ",
+            svc->vcpu->domain->domain_id,
+            svc->vcpu->vcpu_id,
+            svc->vcpu->processor,
+            svc->period,
+            svc->budget,
+            svc->cur_budget,
+            svc->cur_deadline,
+            svc->last_start,
+            __vcpu_on_q(svc),
+            vcpu_runnable(svc->vcpu),
+            cpustr);
+    memset(cpustr, 0, sizeof(cpustr));
+    cpupool_mask = cpupool_scheduler_cpumask(svc->vcpu->domain->cpupool);
+    cpumask_scnprintf(cpustr, sizeof(cpustr), cpupool_mask);
+    printk("cpupool=%s\n", cpustr);
+}
+
+static void
+rt_dump_pcpu(const struct scheduler *ops, int cpu)
+{
+    struct rt_vcpu *svc = rt_vcpu(curr_on_cpu(cpu));
+
+    rt_dump_vcpu(ops, svc);
+}
+
+static void
+rt_dump(const struct scheduler *ops)
+{
+    struct list_head *iter_sdom, *iter_svc, *runq, *depletedq, *iter;
+    struct rt_private *prv = rt_priv(ops);
+    struct rt_vcpu *svc;
+    unsigned int cpu = 0;
+    cpumask_t *online;
+    struct rt_dom *sdom;
+    unsigned long flags;
+
+    ASSERT(!list_empty(&prv->sdom));
+    sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
+    online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
+    runq = rt_runq(ops);
+    depletedq = rt_depletedq(ops);
+
+    printk("PCPU info:\n");
+    for_each_cpu(cpu, online)
+        rt_dump_pcpu(ops, cpu);
+
+    printk("Global RunQueue info:\n");
+    spin_lock_irqsave(&prv->lock, flags);
+    list_for_each( iter, runq )
+    {
+        svc = __q_elem(iter);
+        rt_dump_vcpu(ops, svc);
+    }
+
+    printk("Global DepletedQueue info:\n");
+    list_for_each( iter, depletedq )
+    {
+        svc = __q_elem(iter);
+        rt_dump_vcpu(ops, svc);
+    }
+
+    printk("Domain info:\n");
+    list_for_each( iter_sdom, &prv->sdom )
+    {
+        sdom = list_entry(iter_sdom, struct rt_dom, sdom_elem);
+        printk("\tdomain: %d\n", sdom->dom->domain_id);
+
+        list_for_each( iter_svc, &sdom->vcpu )
+        {
+            svc = list_entry(iter_svc, struct rt_vcpu, sdom_elem);
+            rt_dump_vcpu(ops, svc);
+        }
+    }
+
+    spin_unlock_irqrestore(&prv->lock, flags);
+}
+
+/*
+ * Update deadline and budget when now >= cur_deadline;
+ * the deadline needs to be updated to that of the current period
+ */
+static void
+rt_update_deadline(s_time_t now, struct rt_vcpu *svc)
+{
+    ASSERT(now >= svc->cur_deadline);
+    ASSERT(svc->period != 0);
+
+    do
+        svc->cur_deadline += svc->period;
+    while ( svc->cur_deadline <= now );
+
+    svc->cur_budget = svc->budget;
+
+    /* TRACE */
+    {
+        struct {
+            unsigned dom:16,vcpu:16;
+            unsigned cur_deadline_lo, cur_deadline_hi;
+            unsigned cur_budget_lo, cur_budget_hi;
+        } d;
+        d.dom = svc->vcpu->domain->domain_id;
+        d.vcpu = svc->vcpu->vcpu_id;
+        d.cur_deadline_lo = (unsigned) svc->cur_deadline;
+        d.cur_deadline_hi = (unsigned) (svc->cur_deadline >> 32);
+        d.cur_budget_lo = (unsigned) svc->cur_budget;
+        d.cur_budget_hi = (unsigned) (svc->cur_budget >> 32);
+        trace_var(TRC_RTDS_BUDGET_REPLENISH, 1,
+                  sizeof(d),
+                  (unsigned char *) &d);
+    }
+
+    return;
+}
+
+static inline void
+__q_remove(struct rt_vcpu *svc)
+{
+    if ( __vcpu_on_q(svc) )
+        list_del_init(&svc->q_elem);
+}
+
+/*
+ * Insert svc with budget into the RunQ according to EDF:
+ * vcpus with smaller deadlines go first.
+ * Insert svc without budget into the DepletedQ, unsorted;
+ */
+static void
+__runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
+{
+    struct rt_private *prv = rt_priv(ops);
+    struct list_head *runq = rt_runq(ops);
+    struct list_head *iter;
+
+    ASSERT( spin_is_locked(&prv->lock) );
+
+    ASSERT( !__vcpu_on_q(svc) );
+
+    /* add svc to runq if svc still has budget */
+    if ( svc->cur_budget > 0 )
+    {
+        list_for_each(iter, runq)
+        {
+            struct rt_vcpu * iter_svc = __q_elem(iter);
+            if ( svc->cur_deadline <= iter_svc->cur_deadline )
+                break;
+        }
+        list_add_tail(&svc->q_elem, iter);
+    }
+    else
+    {
+        list_add(&svc->q_elem, &prv->depletedq);
+    }
+}
+
+/*
+ * Init/Free related code
+ */
+static int
+rt_init(struct scheduler *ops)
+{
+    struct rt_private *prv = xzalloc(struct rt_private);
+
+    printk("Initializing RTDS scheduler\n"
+           "WARNING: This is experimental software in development.\n"
+           "Use at your own risk.\n");
+
+    if ( prv == NULL )
+        return -ENOMEM;
+
+    spin_lock_init(&prv->lock);
+    INIT_LIST_HEAD(&prv->sdom);
+    INIT_LIST_HEAD(&prv->runq);
+    INIT_LIST_HEAD(&prv->depletedq);
+
+    cpumask_clear(&prv->tickled);
+
+    ops->sched_data = prv;
+
+    return 0;
+}
+
+static void
+rt_deinit(const struct scheduler *ops)
+{
+    struct rt_private *prv = rt_priv(ops);
+
+    xfree(prv);
+}
+
+/*
+ * Point the per_cpu spinlock to the global system lock;
+ * all cpus share the same global system lock
+ */
+static void *
+rt_alloc_pdata(const struct scheduler *ops, int cpu)
+{
+    struct rt_private *prv = rt_priv(ops);
+
+    per_cpu(schedule_data, cpu).schedule_lock = &prv->lock;
+
+    /* 1 indicates the allocation succeeded, as checked in schedule.c */
+    return (void *)1;
+}
+
+static void
+rt_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
+{
+    per_cpu(schedule_data, cpu).schedule_lock = NULL;
+}
+
+static void *
+rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
+{
+    unsigned long flags;
+    struct rt_dom *sdom;
+    struct rt_private * prv = rt_priv(ops);
+
+    sdom = xzalloc(struct rt_dom);
+    if ( sdom == NULL )
+        return NULL;
+
+    INIT_LIST_HEAD(&sdom->vcpu);
+    INIT_LIST_HEAD(&sdom->sdom_elem);
+    sdom->dom = dom;
+
+    /* spinlock here to insert the dom */
+    spin_lock_irqsave(&prv->lock, flags);
+    list_add_tail(&sdom->sdom_elem, &(prv->sdom));
+    spin_unlock_irqrestore(&prv->lock, flags);
+
+    return sdom;
+}
+
+static void
+rt_free_domdata(const struct scheduler *ops, void *data)
+{
+    unsigned long flags;
+    struct rt_dom *sdom = data;
+    struct rt_private *prv = rt_priv(ops);
+
+    spin_lock_irqsave(&prv->lock, flags);
+    list_del_init(&sdom->sdom_elem);
+    spin_unlock_irqrestore(&prv->lock, flags);
+    xfree(data);
+}
+
+static int
+rt_dom_init(const struct scheduler *ops, struct domain *dom)
+{
+    struct rt_dom *sdom;
+
+    /* IDLE Domain does not link on rt_private */
+    if ( is_idle_domain(dom) )
+        return 0;
+
+    sdom = rt_alloc_domdata(ops, dom);
+    if ( sdom == NULL )
+        return -ENOMEM;
+
+    dom->sched_priv = sdom;
+
+    return 0;
+}
+
+static void
+rt_dom_destroy(const struct scheduler *ops, struct domain *dom)
+{
+    rt_free_domdata(ops, rt_dom(dom));
+}
+
+static void *
+rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
+{
+    struct rt_vcpu *svc;
+    long count;
+    s_time_t now = NOW();
+
+    /* Allocate per-VCPU info */
+    svc = xzalloc(struct rt_vcpu);
+    if ( svc == NULL )
+        return NULL;
+
+    INIT_LIST_HEAD(&svc->q_elem);
+    INIT_LIST_HEAD(&svc->sdom_elem);
+    svc->flags = 0U;
+    svc->sdom = dd;
+    svc->vcpu = vc;
+    svc->last_start = 0;
+
+    svc->period = RTDS_DEFAULT_PERIOD;
+    if ( !is_idle_vcpu(vc) )
+        svc->budget = RTDS_DEFAULT_BUDGET;
+
+    ASSERT( now >= svc->cur_deadline );
+
+    count = now / svc->period + 1;
+    svc->cur_deadline = count * svc->period;
+    svc->cur_budget = svc->budget;
+
+    /* TRACE */
+    {
+        struct {
+            unsigned dom:16,vcpu:16;
+            unsigned cur_deadline_lo, cur_deadline_hi;
+            unsigned cur_budget_lo, cur_budget_hi;
+        } d;
+        d.dom = svc->vcpu->domain->domain_id;
+        d.vcpu = svc->vcpu->vcpu_id;
+        d.cur_deadline_lo = (unsigned) svc->cur_deadline;
+        d.cur_deadline_hi = (unsigned) (svc->cur_deadline >> 32);
+        d.cur_budget_lo = (unsigned) svc->cur_budget;
+        d.cur_budget_hi = (unsigned) (svc->cur_budget >> 32);
+        trace_var(TRC_RTDS_VCPU_CREATE, 1,
+                  sizeof(d),
+                  (unsigned char *) &d);
+    }
+
+    return svc;
+}
+
+static void
+rt_free_vdata(const struct scheduler *ops, void *priv)
+{
+    struct rt_vcpu *svc = priv;
+
+    xfree(svc);
+}
+
+/*
+ * This function is called in sched_move_domain() in schedule.c
+ * when a domain is moved to a new cpupool.
+ * It inserts the vcpus of the moving domain into the RunQ of the
+ * destination cpupool's scheduler, and inserts rt_vcpu svc into the
+ * scheduler-specific vcpu list of the dom
+ */
+static void
+rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
+{
+    struct rt_private *prv = rt_priv(ops);
+    struct rt_vcpu *svc = rt_vcpu(vc);
+
+    /* do not add the idle vcpu to the dom vcpu list */
+    if ( is_idle_vcpu(vc) )
+        return;
+
+    if ( !__vcpu_on_q(svc) && vcpu_runnable(vc) && !vc->is_running )
+    {
+        if ( svc->cur_budget > 0 )
+            __runq_insert(ops, svc);
+        else
+            list_add(&svc->q_elem, &prv->depletedq);
+    }
+
+    /* add rt_vcpu svc to the scheduler-specific vcpu list of the dom */
+    list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
+}
+
+/*
+ * Remove rt_vcpu svc from the old scheduler in the source cpupool, and
+ * remove rt_vcpu svc from the scheduler-specific vcpu list of the dom
+ */
+static void
+rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
+{
+    struct rt_vcpu * const svc = rt_vcpu(vc);
+    struct rt_dom * const sdom = svc->sdom;
+
+    BUG_ON( sdom == NULL );
+
+    if ( __vcpu_on_q(svc) )
+        __q_remove(svc);
+
+    if ( !is_idle_vcpu(vc) )
+        list_del_init(&svc->sdom_elem);
+}
+
+/*
+ * Pick a valid CPU for the vcpu vc
+ * The valid CPUs of a vcpu are the intersection of the vcpu's affinity
+ * and the available cpus
+ */
+static int
+rt_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
+{
+    cpumask_t cpus;
+    cpumask_t *online;
+    int cpu;
+
+    online = cpupool_scheduler_cpumask(vc->domain->cpupool);
+    cpumask_and(&cpus, online, vc->cpu_hard_affinity);
+
+    cpu = cpumask_test_cpu(vc->processor, &cpus)
+            ? vc->processor
+            : cpumask_cycle(vc->processor, &cpus);
+    ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );
+
+    return cpu;
+}
+
+/*
+ * Burn budget with nanosecond granularity
+ */
+static void
+burn_budget(const struct scheduler *ops, struct rt_vcpu *svc, s_time_t now)
+{
+    s_time_t delta;
+
+    /* don't burn budget for the idle VCPU */
+    if ( is_idle_vcpu(svc->vcpu) )
+        return;
+
+    /* burn at nanosecond level */
+    delta = now - svc->last_start;
+    /*
+     * delta < 0 only happens in nested virtualization;
+     * TODO: how should we handle delta < 0 in a better way?
+     */
+    if ( delta < 0 )
+    {
+        printk("%s, ATTENTION: now is behind last_start! delta = %ld\n",
+                __func__, delta);
+        svc->last_start = now;
+        return;
+    }
+
+    svc->cur_budget -= delta;
+
+    if ( svc->cur_budget < 0 )
+        svc->cur_budget = 0;
+
+    /* TRACE */
+    {
+        struct {
+            unsigned dom:16, vcpu:16;
+            unsigned cur_budget_lo;
+            unsigned cur_budget_hi;
+            int delta;
+        } d;
+        d.dom = svc->vcpu->domain->domain_id;
+        d.vcpu = svc->vcpu->vcpu_id;
+        d.cur_budget_lo = (unsigned) svc->cur_budget;
+        d.cur_budget_hi = (unsigned) (svc->cur_budget >> 32);
+        d.delta = delta;
+        trace_var(TRC_RTDS_BUDGET_BURN, 1,
+                  sizeof(d),
+                  (unsigned char *) &d);
+    }
+}
+
+/*
+ * The RunQ is sorted. Pick the first runnable vcpu within the cpumask;
+ * if there is none, return NULL.
+ * The lock is grabbed before calling this function
+ */
+static struct rt_vcpu *
+__runq_pick(const struct scheduler *ops, cpumask_t *mask)
+{
+    struct list_head *runq = rt_runq(ops);
+    struct list_head *iter;
+    struct rt_vcpu *svc = NULL;
+    struct rt_vcpu *iter_svc = NULL;
+    cpumask_t cpu_common;
+    cpumask_t *online;
+
+    list_for_each(iter, runq)
+    {
+        iter_svc = __q_elem(iter);
+
+        /* mask cpu_hard_affinity & cpupool & mask */
+        online = cpupool_scheduler_cpumask(iter_svc->vcpu->domain->cpupool);
+        cpumask_and(&cpu_common, online, iter_svc->vcpu->cpu_hard_affinity);
+        cpumask_and(&cpu_common, mask, &cpu_common);
+        if ( cpumask_empty(&cpu_common) )
+            continue;
+
+        ASSERT( iter_svc->cur_budget > 0 );
+
+        svc = iter_svc;
+        break;
+    }
+
+    /* TRACE */
+    {
+        if( svc != NULL )
+        {
+            struct {
+                unsigned dom:16, vcpu:16;
+                unsigned cur_deadline_lo, cur_deadline_hi;
+                unsigned cur_budget_lo, cur_budget_hi;
+            } d;
+            d.dom = svc->vcpu->domain->domain_id;
+            d.vcpu = svc->vcpu->vcpu_id;
+            d.cur_deadline_lo = (unsigned) svc->cur_deadline;
+            d.cur_deadline_hi = (unsigned) (svc->cur_deadline >> 32);
+            d.cur_budget_lo = (unsigned) svc->cur_budget;
+            d.cur_budget_hi = (unsigned) (svc->cur_budget >> 32);
+            trace_var(TRC_RTDS_RUNQ_PICK, 1,
+                      sizeof(d),
+                      (unsigned char *) &d);
+        }
+        else
+            trace_var(TRC_RTDS_RUNQ_PICK, 1, 0, NULL);
+    }
+
+    return svc;
+}
+
+/*
+ * Update vcpus' budgets and keep the runq sorted by re-inserting each
+ * modified vcpu back into the runq.
+ * The lock is grabbed before calling this function
+ */
+static void
+__repl_update(const struct scheduler *ops, s_time_t now)
+{
+    struct list_head *runq = rt_runq(ops);
+    struct list_head *depletedq = rt_depletedq(ops);
+    struct list_head *iter;
+    struct list_head *tmp;
+    struct rt_vcpu *svc = NULL;
+
+    list_for_each_safe(iter, tmp, runq)
+    {
+        svc = __q_elem(iter);
+
+        if ( now >= svc->cur_deadline )
+        {
+            rt_update_deadline(now, svc);
+            /* reinsert the vcpu if its deadline is updated */
+            __q_remove(svc);
+            __runq_insert(ops, svc);
+        }
+        else
+            break;
+    }
+
+    list_for_each_safe(iter, tmp, depletedq)
+    {
+        svc = __q_elem(iter);
+        if ( now >= svc->cur_deadline )
+        {
+            rt_update_deadline(now, svc);
+            __q_remove(svc); /* remove from depleted queue */
+            __runq_insert(ops, svc); /* add to runq */
+        }
+    }
+}
+
+/*
+ * schedule function for the rt scheduler.
+ * The lock is already grabbed in schedule.c, no need to lock here
+ */
+static struct task_slice
+rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled)
+{
+    const int cpu = smp_processor_id();
+    struct rt_private *prv = rt_priv(ops);
+    struct rt_vcpu *const scurr = rt_vcpu(current);
+    struct rt_vcpu *snext = NULL;
+    struct task_slice ret = { .migrated = 0 };
+
+    /* clear the tickled bit now that we've been scheduled */
+    cpumask_clear_cpu(cpu, &prv->tickled);
+
+    /* burn_budget would return for IDLE VCPU */
+    burn_budget(ops, scurr, now);
+
+    __repl_update(ops, now);
+
+    if ( tasklet_work_scheduled )
+    {
+        snext = rt_vcpu(idle_vcpu[cpu]);
+    }
+    else
+    {
+        cpumask_t cur_cpu;
+        cpumask_clear(&cur_cpu);
+        cpumask_set_cpu(cpu, &cur_cpu);
+        snext = __runq_pick(ops, &cur_cpu);
+        if ( snext == NULL )
+            snext = rt_vcpu(idle_vcpu[cpu]);
+
+        /* if scurr has higher priority and budget, still pick scurr */
+        if ( !is_idle_vcpu(current) &&
+             vcpu_runnable(current) &&
+             scurr->cur_budget > 0 &&
+             ( is_idle_vcpu(snext->vcpu) ||
+               scurr->cur_deadline <= snext->cur_deadline ) )
+            snext = scurr;
+    }
+
+    if ( snext != scurr &&
+         !is_idle_vcpu(current) &&
+         vcpu_runnable(current) )
+        set_bit(__RTDS_delayed_runq_add, &scurr->flags);
+
+    snext->last_start = now;
+    if ( !is_idle_vcpu(snext->vcpu) )
+    {
+        if ( snext != scurr )
+        {
+            __q_remove(snext);
+            set_bit(__RTDS_scheduled, &snext->flags);
+        }
+        if ( snext->vcpu->processor != cpu )
+        {
+            snext->vcpu->processor = cpu;
+            ret.migrated = 1;
+        }
+    }
+
+    ret.time = MILLISECS(1); /* sched quantum */
+    ret.task = snext->vcpu;
+
+    /* TRACE */
+    {
+        struct {
+            unsigned dom:16,vcpu:16;
+            unsigned cur_deadline_lo, cur_deadline_hi;
+            unsigned cur_budget_lo, cur_budget_hi;
+        } d;
+        d.dom = snext->vcpu->domain->domain_id;
+        d.vcpu = snext->vcpu->vcpu_id;
+        d.cur_deadline_lo = (unsigned) snext->cur_deadline;
+        d.cur_deadline_hi = (unsigned) (snext->cur_deadline >> 32);
+        d.cur_budget_lo = (unsigned) snext->cur_budget;
+        d.cur_budget_hi = (unsigned) (snext->cur_budget >> 32);
+        trace_var(TRC_RTDS_SCHED_TASKLET, 1,
+                  sizeof(d),
+                  (unsigned char *)&d);
+    }
+
+    return ret;
+}
+
+/*
+ * Remove the VCPU from the RunQ
+ * The lock is already grabbed in schedule.c, no need to lock here
+ */
+static void
+rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
+{
+    struct rt_vcpu * const svc = rt_vcpu(vc);
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    if ( curr_on_cpu(vc->processor) == vc )
+        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
+    else if ( __vcpu_on_q(svc) )
+        __q_remove(svc);
+    else if ( test_bit(__RTDS_delayed_runq_add, &svc->flags) )
+        clear_bit(__RTDS_delayed_runq_add, &svc->flags);
+}
+
+/*
+ * Pick a cpu where to run a vcpu, possibly kicking out the vcpu running there
+ * Called by wake() and context_saved()
+ * We have a running candidate here, the kick logic is:
+ * Among all the cpus that are within the cpu affinity
+ * 1) if the new->cpu is idle, kick it. This could benefit cache hit
+ * 2) if there is any idle pcpu, kick it.
+ * 3) now all pcpus are busy; among all the running vcpus, pick the one with
+ *    the lowest priority, and kick it if snext has higher priority.
+ *
+ * TODO:
+ * 1) what if these two vcpus belong to the same domain?
+ *    replacing a vcpu belonging to the same domain introduces more overhead
+ *
+ * The lock is grabbed before calling this function
+ */
+static void
+runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
+{
+    struct rt_private *prv = rt_priv(ops);
+    struct rt_vcpu *latest_deadline_vcpu = NULL;    /* lowest priority scheduled */
+    struct rt_vcpu *iter_svc;
+    struct vcpu *iter_vc;
+    int cpu = 0, cpu_to_tickle = 0;
+    cpumask_t not_tickled;
+    cpumask_t *online;
+
+    if ( new == NULL || is_idle_vcpu(new->vcpu) )
+        return;
+
+    online = cpupool_scheduler_cpumask(new->vcpu->domain->cpupool);
+    cpumask_and(&not_tickled, online, new->vcpu->cpu_hard_affinity);
+    cpumask_andnot(&not_tickled, &not_tickled, &prv->tickled);
+
+    /* 1) if new's previous cpu is idle, kick it for cache benefit */
+    if ( is_idle_vcpu(curr_on_cpu(new->vcpu->processor)) )
+    {
+        cpu_to_tickle = new->vcpu->processor;
+        goto out;
+    }
+
+    /* 2) if there is any idle pcpu, kick it */
+    /* The same loop also finds the one with the lowest priority */
+    for_each_cpu(cpu, &not_tickled)
+    {
+        iter_vc = curr_on_cpu(cpu);
+        if ( is_idle_vcpu(iter_vc) )
+        {
+            cpu_to_tickle = cpu;
+            goto out;
+        }
+        iter_svc = rt_vcpu(iter_vc);
+        if ( latest_deadline_vcpu == NULL ||
+             iter_svc->cur_deadline > latest_deadline_vcpu->cur_deadline )
+            latest_deadline_vcpu = iter_svc;
+    }
+
+    /* 3) the candidate has higher priority, kick out the lowest priority vcpu */
+    if ( latest_deadline_vcpu != NULL &&
+         new->cur_deadline < latest_deadline_vcpu->cur_deadline )
+    {
+        cpu_to_tickle = latest_deadline_vcpu->vcpu->processor;
+        goto out;
+    }
+
+out:
+    /* TRACE */
+    {
+        struct {
+            unsigned cpu:8, pad:24;
+        } d;
+        d.cpu = cpu_to_tickle;
+        d.pad = 0;
+        trace_var(TRC_RTDS_TICKLE, 0,
+                  sizeof(d),
+                  (unsigned char *)&d);
+    }
+
+    cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
+    cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);
+    return;
+}
+
+/*
+ * Should always wake up a runnable vcpu and put it back on the RunQ.
+ * Check priority to raise interrupt
+ * The lock is already grabbed in schedule.c, no need to lock here
+ * TODO: what if these two vcpus belong to the same domain?
+ */
+static void
+rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
+{
+    struct rt_vcpu * const svc = rt_vcpu(vc);
+    s_time_t now = NOW();
+    struct rt_private *prv = rt_priv(ops);
+    struct rt_vcpu *snext = NULL; /* highest priority on RunQ */
+    struct rt_dom *sdom = NULL;
+    cpumask_t *online;
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    if ( unlikely(curr_on_cpu(vc->processor) == vc) )
+        return;
+
+    /* already on RunQ/DepletedQ; just updating the info is ok */
+    if ( unlikely(__vcpu_on_q(svc)) )
+        return;
+
+    /* If context hasn't been saved for this vcpu yet, we can't put it on
+     * the Runqueue/DepletedQ. Instead, we set a flag so that it will be
+     * put on the Runqueue/DepletedQ after the context has been saved.
+     */
+    if ( unlikely(test_bit(__RTDS_scheduled, &svc->flags)) )
+    {
+        set_bit(__RTDS_delayed_runq_add, &svc->flags);
+        return;
+    }
+
+    if ( now >= svc->cur_deadline )
+        rt_update_deadline(now, svc);
+
+    /* insert svc into the runq/depletedq because svc is not in a queue now */
+    if ( svc->cur_budget > 0 )
+        __runq_insert(ops, svc);
+    else
+        list_add(&svc->q_elem, &prv->depletedq);
+
+    __repl_update(ops, now);
+
+    ASSERT(!list_empty(&prv->sdom));
+    sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
+    online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
+    snext = __runq_pick(ops, online); /* pick snext from ALL valid cpus */
+
+    runq_tickle(ops, snext);
+
+    return;
+}
+
+/*
+ * scurr has finished context switching; insert it back into the RunQ,
+ * and then pick the highest-priority vcpu from the runq to run
+ */
+static void
+rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
+{
+    struct rt_vcpu *svc = rt_vcpu(vc);
+    struct rt_vcpu *snext = NULL;
+    struct rt_dom *sdom = NULL;
+    struct rt_private *prv = rt_priv(ops);
+    cpumask_t *online;
+    spinlock_t *lock = vcpu_schedule_lock_irq(vc);
+
+    clear_bit(__RTDS_scheduled, &svc->flags);
+    /* do not insert the idle vcpu into the runq */
+    if ( is_idle_vcpu(vc) )
+        goto out;
+
+    if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
+         likely(vcpu_runnable(vc)) )
+    {
+        __runq_insert(ops, svc);
+        __repl_update(ops, NOW());
+
+        ASSERT(!list_empty(&prv->sdom));
+        sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
+        online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
+        snext = __runq_pick(ops, online); /* pick snext from ALL cpus */
+
+        runq_tickle(ops, snext);
+    }
+out:
+    vcpu_schedule_unlock_irq(lock, vc);
+}
+
+/*
+ * set/get the vcpu info of each domain
+ */
+static int
+rt_dom_cntl(
+    const struct scheduler *ops,
+    struct domain *d,
+    struct xen_domctl_scheduler_op *op)
+{
+    struct rt_dom * const sdom = rt_dom(d);
+    struct rt_vcpu *svc;
+    struct list_head *iter;
+    int rc = 0;
+
+    switch ( op->cmd )
+    {
+    case XEN_DOMCTL_SCHEDOP_getinfo:
+        /* for debug use: whenever Dom0's parameters are read, do a global dump */
+        if ( d->domain_id == 0 )
+            rt_dump(ops);
+
+        svc = list_entry(sdom->vcpu.next, struct rt_vcpu, sdom_elem);
+        op->u.rtds.period = svc->period / MICROSECS(1); /* convert to us */
+        op->u.rtds.budget = svc->budget / MICROSECS(1);
+        break;
+    case XEN_DOMCTL_SCHEDOP_putinfo:
+        list_for_each( iter, &sdom->vcpu )
+        {
+            struct rt_vcpu * svc = list_entry(iter, struct rt_vcpu, sdom_elem);
+            svc->period = MICROSECS(op->u.rtds.period); /* convert to nanosec */
+            svc->budget = MICROSECS(op->u.rtds.budget);
+        }
+        break;
+    }
+
+    return rc;
+}
+
+static struct rt_private _rt_priv;
+
+const struct scheduler sched_rtds_def = {
+    .name           = "SMP RTDS Scheduler",
+    .opt_name       = "rtds",
+    .sched_id       = XEN_SCHEDULER_RTDS,
+    .sched_data     = &_rt_priv,
+
+    .dump_cpu_state = rt_dump_pcpu,
+    .dump_settings  = rt_dump,
+    .init           = rt_init,
+    .deinit         = rt_deinit,
+    .alloc_pdata    = rt_alloc_pdata,
+    .free_pdata     = rt_free_pdata,
+    .alloc_domdata  = rt_alloc_domdata,
+    .free_domdata   = rt_free_domdata,
+    .init_domain    = rt_dom_init,
+    .destroy_domain = rt_dom_destroy,
+    .alloc_vdata    = rt_alloc_vdata,
+    .free_vdata     = rt_free_vdata,
+    .insert_vcpu    = rt_vcpu_insert,
+    .remove_vcpu    = rt_vcpu_remove,
+
+    .adjust         = rt_dom_cntl,
+
+    .pick_cpu       = rt_cpu_pick,
+    .do_schedule    = rt_schedule,
+    .sleep          = rt_vcpu_sleep,
+    .wake           = rt_vcpu_wake,
+    .context_saved  = rt_context_saved,
+};
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 73cc2ea..6285a6e 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -69,6 +69,7 @@ static const struct scheduler *schedulers[] = {
     &sched_credit_def,
     &sched_credit2_def,
     &sched_arinc653_def,
+    &sched_rtds_def,
 };
 
 static struct scheduler __read_mostly ops;
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 69a8b44..951289b 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -347,6 +347,8 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
 #define XEN_SCHEDULER_CREDIT   5
 #define XEN_SCHEDULER_CREDIT2  6
 #define XEN_SCHEDULER_ARINC653 7
+#define XEN_SCHEDULER_RTDS     8
+
 /* Set or get info? */
 #define XEN_DOMCTL_SCHEDOP_putinfo 0
 #define XEN_DOMCTL_SCHEDOP_getinfo 1
@@ -368,6 +370,10 @@ struct xen_domctl_scheduler_op {
         struct xen_domctl_sched_credit2 {
             uint16_t weight;
         } credit2;
+        struct xen_domctl_sched_rtds {
+            uint32_t period;
+            uint32_t budget;
+        } rtds;
     } u;
 };
 typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
diff --git a/xen/include/public/trace.h b/xen/include/public/trace.h
index cfcf4aa..5211ae7 100644
--- a/xen/include/public/trace.h
+++ b/xen/include/public/trace.h
@@ -77,6 +77,7 @@
 #define TRC_SCHED_CSCHED2  1
 #define TRC_SCHED_SEDF     2
 #define TRC_SCHED_ARINC653 3
+#define TRC_SCHED_RTDS     4
 
 /* Per-scheduler tracing */
 #define TRC_SCHED_CLASS_EVT(_c, _e) \
diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h
index 4164dff..7cc25c6 100644
--- a/xen/include/xen/sched-if.h
+++ b/xen/include/xen/sched-if.h
@@ -169,6 +169,7 @@ extern const struct scheduler sched_sedf_def;
 extern const struct scheduler sched_credit_def;
 extern const struct scheduler sched_credit2_def;
 extern const struct scheduler sched_arinc653_def;
+extern const struct scheduler sched_rtds_def;
 
 struct cpupool
-- 
1.7.9.5


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel
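For context on how the interface added above is meant to be driven: rt_dom_cntl()
reads and writes the per-domain period/budget in microseconds through the new
xen_domctl_sched_rtds fields. The sketch below is not part of this patch; it is a
hypothetical toolstack-side caller, assuming only the public domctl structures and
leaving the actual hypercall wrapper (e.g. libxc's domctl helper) out.

    /* Hypothetical toolstack-side sketch (not part of this patch): fill a
     * xen_domctl request that sets an RTDS domain's parameters to the
     * scheduler defaults, period 10000us / budget 4000us, which the
     * XEN_DOMCTL_SCHEDOP_putinfo case of rt_dom_cntl() then converts to
     * nanoseconds for every vcpu of the domain. */
    #include <string.h>
    #include <xen/domctl.h>

    static void build_rtds_putinfo(struct xen_domctl *domctl, domid_t domid,
                                   uint32_t period_us, uint32_t budget_us)
    {
        memset(domctl, 0, sizeof(*domctl));
        domctl->cmd = XEN_DOMCTL_scheduler_op;
        domctl->interface_version = XEN_DOMCTL_INTERFACE_VERSION;
        domctl->domain = domid;
        domctl->u.scheduler_op.sched_id = XEN_SCHEDULER_RTDS;
        domctl->u.scheduler_op.cmd = XEN_DOMCTL_SCHEDOP_putinfo;
        domctl->u.scheduler_op.u.rtds.period = period_us; /* microseconds */
        domctl->u.scheduler_op.u.rtds.budget = budget_us; /* microseconds */
        /* the filled request would then be issued via the usual domctl
         * hypercall wrapper, which is outside the scope of this patch */
    }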