[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH RFC v1 1/4] rt: Add rt scheduler to hypervisor



This is the core rt scheduler patch. It adds the new real time scheduler to
the hypervisor, as the non-default scheduler.

This scheduler follows the pre-emptive Global EDF theory in real-time field.
Each VCPU can have a dedicated period and budget.
While scheduled, a VCPU burns its budget.
A VCPU has its budget replenished at the beginning of each of its periods;
The VCPU discards its unused budget at the end of each of its periods.
If a VCPU runs out of budget in a period, it has to wait until next period.
The mechanism of how to burn a VCPU's budget depends on the server mechanism
implemented for each VCPU.

Server mechanism: a VCPU is implemented as a deferable server.
When a VCPU has a task running on it, its budget is continuously
burned;
When a VCPU has no task but with budget left, its budget is
preserved.

Priority scheme: Preemptive Global Earliest Deadline First (gEDF).
At any scheduling point, the VCPU with earliest deadline has highest
priority.

Queue scheme: A global runqueue for each CPU pool.
The runqueue holds all runnable VCPUs.
VCPUs in the runqueue are divided into two parts: with and without budget.
At each part, VCPUs are sorted based on EDF priority scheme.

Scheduling quanta: 1 ms; but accounting the budget is in microsecond.

Note: cpumask and cpupool is supported.

This is still in the development phase.

Signed-off-by: Sisu Xi <xisisu@xxxxxxxxx>
Signed-off-by: Meng Xu <mengxu@xxxxxxxxxxxxx>
---
 xen/common/Makefile         |    1 +
 xen/common/sched_rt.c       |  984 +++++++++++++++++++++++++++++++++++++++++++
 xen/common/schedule.c       |    1 +
 xen/include/public/domctl.h |   19 +
 xen/include/xen/sched-if.h  |    2 +-
 5 files changed, 1006 insertions(+), 1 deletion(-)
 create mode 100644 xen/common/sched_rt.c

diff --git a/xen/common/Makefile b/xen/common/Makefile
index 3683ae3..5a23aa4 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -26,6 +26,7 @@ obj-y += sched_credit.o
 obj-y += sched_credit2.o
 obj-y += sched_sedf.o
 obj-y += sched_arinc653.o
+obj-y += sched_rt.o
 obj-y += schedule.o
 obj-y += shutdown.o
 obj-y += softirq.o
diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
new file mode 100644
index 0000000..41543a2
--- /dev/null
+++ b/xen/common/sched_rt.c
@@ -0,0 +1,984 @@
+/******************************************************************************
+ * Preemptive Global Earliest Deadline First  (EDF) scheduler for Xen
+ * EDF scheduling is one of most popular real-time scheduling algorithm used in
+ * embedded field.
+ *
+ * by Sisu Xi, 2013, Washington University in Saint Louis
+ * and Meng Xu, 2014, University of Pennsylvania
+ *
+ * based on the code of credit Scheduler
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/perfc.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+#include <asm/atomic.h>
+#include <xen/errno.h>
+#include <xen/trace.h>
+#include <xen/cpu.h>
+#include <xen/keyhandler.h>
+#include <xen/trace.h>
+#include <xen/guest_access.h>
+
+/*
+ * TODO:
+ *
+ * Migration compensation and resist like credit2 to better use cache;
+ * Lock Holder Problem, using yield?
+ * Self switch problem: VCPUs of the same domain may preempt each other;
+ */
+
+/*
+ * Design:
+ *
+ * This scheduler follows the Preemptive Global EDF theory in real-time field.
+ * Each VCPU can have a dedicated period and budget. 
+ * While scheduled, a VCPU burns its budget.
+ * A VCPU has its budget replenished at the beginning of each of its periods;
+ * The VCPU discards its unused budget at the end of each of its periods.
+ * If a VCPU runs out of budget in a period, it has to wait until next period.
+ * The mechanism of how to burn a VCPU's budget depends on the server mechanism
+ * implemented for each VCPU.
+ *
+ * Server mechanism: a VCPU is implemented as a deferable server.
+ * When a VCPU has a task running on it, its budget is continuously burned;
+ * When a VCPU has no task but with budget left, its budget is preserved.
+ *
+ * Priority scheme: Preemptive Global Earliest Deadline First (gEDF).
+ * At any scheduling point, the VCPU with earliest deadline has highest 
priority.
+ *
+ * Queue scheme: A global runqueue for each CPU pool. 
+ * The runqueue holds all runnable VCPUs. 
+ * VCPUs in the runqueue are divided into two parts: with and without 
remaining budget. 
+ * At each part, VCPUs are sorted based on EDF priority scheme.
+ *
+ * Scheduling quanta: 1 ms; but accounting the budget is in microsecond.
+ *
+ * Note: cpumask and cpupool is supported.
+ */
+
+/*
+ * Locking:
+ * Just like credit2, a global system lock is used to protect the RunQ.
+ * The global lock is referenced by schedule_data.schedule_lock from all 
physical cpus.
+ *
+ * The lock is already grabbed when calling wake/sleep/schedule/ functions in 
schedule.c
+ *
+ * The functions involes RunQ and needs to grab locks are:
+ *    dump, vcpu_insert, vcpu_remove, context_saved,
+ */
+
+
+/*
+ * Default parameters in ms
+ */
+#define RT_DEFAULT_PERIOD     10
+#define RT_DEFAULT_BUDGET      4
+
+/*
+ * Useful macros
+ */
+#define RT_PRIV(_ops)     ((struct rt_private *)((_ops)->sched_data))
+#define RT_VCPU(_vcpu)    ((struct rt_vcpu *)(_vcpu)->sched_priv)
+#define RT_DOM(_dom)      ((struct rt_dom *)(_dom)->sched_priv)
+#define RUNQ(_ops)              (&RT_PRIV(_ops)->runq)
+
+/*
+ * Flags
+ */
+#define __RT_scheduled            1
+#define RT_scheduled (1<<__RT_scheduled)
+#define __RT_delayed_runq_add     2
+#define RT_delayed_runq_add (1<<__RT_delayed_runq_add)
+
+/*
+ * Used to printout debug information
+ */
+#define printtime()     ( printk("%d : %3ld.%3ld : %-19s ", 
smp_processor_id(), NOW()/MILLISECS(1), NOW()%MILLISECS(1)/1000, __func__) )
+
+/*
+ * Systme-wide private data, include a global RunQueue
+ * The global lock is referenced by schedule_data.schedule_lock from all 
physical cpus.
+ * It can be grabbed via vcpu_schedule_lock_irq()
+ */
+struct rt_private {
+    spinlock_t lock;        /* The global coarse grand lock */
+    struct list_head sdom;  /* list of availalbe domains, used for dump */
+    struct list_head runq;  /* Ordered list of runnable VMs */
+    cpumask_t cpus;         /* cpumask_t of available physical cpus */
+    cpumask_t tickled;      /* another cpu in the queue already ticked this 
one */
+};
+
+/*
+ * Virtual CPU
+ */
+struct rt_vcpu {
+    struct list_head runq_elem; /* On the runqueue list */
+    struct list_head sdom_elem; /* On the domain VCPU list */
+
+    /* Up-pointers */
+    struct rt_dom *sdom;
+    struct vcpu *vcpu;
+
+    /* VCPU parameters, in milliseconds */
+    s_time_t period;
+    s_time_t budget;
+
+    /* VCPU current infomation */
+    long cur_budget;             /* current budget in microseconds */
+    s_time_t last_start;        /* last start time, used to calculate budget */
+    s_time_t cur_deadline;      /* current deadline, used to do EDF */
+    unsigned flags;             /* mark __RT_scheduled, etc.. */
+};
+
+/*
+ * Domain
+ */
+struct rt_dom {
+    struct list_head vcpu;      /* link its VCPUs */
+    struct list_head sdom_elem; /* link list on rt_priv */
+    struct domain *dom;         /* pointer to upper domain */
+};
+
+/*
+ * RunQueue helper functions
+ */
+static int
+__vcpu_on_runq(struct rt_vcpu *svc)
+{
+   return !list_empty(&svc->runq_elem);
+}
+
+static struct rt_vcpu *
+__runq_elem(struct list_head *elem)
+{
+    return list_entry(elem, struct rt_vcpu, runq_elem);
+}
+
+static inline void
+__runq_remove(struct rt_vcpu *svc)
+{
+    if ( __vcpu_on_runq(svc) )
+        list_del_init(&svc->runq_elem);
+}
+
+/*
+ * Insert a vcpu in the RunQ basing on vcpu's deadline: 
+ * vcpus with shorter deadline are inserted first.
+ * EDF schedule policy: vcpu with smaller deadline has higher priority;
+ * When vcpus have the same deadline, insert the current one at the head of 
these vcpus.
+ */
+static void
+__runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
+{
+    struct list_head *runq = RUNQ(ops);
+    struct list_head *iter;
+   
+    ASSERT( spin_is_locked(per_cpu(schedule_data, 
svc->vcpu->processor).schedule_lock) );
+
+    if ( __vcpu_on_runq(svc) )
+        return;
+
+    list_for_each(iter, runq) {
+        struct rt_vcpu * iter_svc = __runq_elem(iter);
+
+        if ( svc->cur_budget > 0 ) { /* svc still has budget */
+            if ( iter_svc->cur_budget == 0 ||
+                 svc->cur_deadline <= iter_svc->cur_deadline )
+                    break;
+        } else { /* svc has no budget */
+            if ( iter_svc->cur_budget == 0 &&
+                 svc->cur_deadline <= iter_svc->cur_deadline )
+                    break;
+        }
+    }
+
+    list_add_tail(&svc->runq_elem, iter);
+}
+
+
+/*
+ * Debug related code, dump vcpu/cpu information
+ */
+static void
+rt_dump_vcpu(struct rt_vcpu *svc)
+{
+    if ( svc == NULL ) {
+        printk("NULL!\n");
+        return;
+    }
+#define cpustr keyhandler_scratch
+    cpumask_scnprintf(cpustr, sizeof(cpustr), svc->vcpu->cpu_hard_affinity);
+    printk("[%5d.%-2d] cpu %d, (%"PRId64", %"PRId64"), cur_b=%"PRId64" 
cur_d=%"PRId64" last_start=%"PRId64" onR=%d runnable=%d cpu_hard_affinity=%s ",
+            svc->vcpu->domain->domain_id,
+            svc->vcpu->vcpu_id,
+            svc->vcpu->processor,
+            svc->period,
+            svc->budget,
+            svc->cur_budget,
+            svc->cur_deadline,
+            svc->last_start,
+            __vcpu_on_runq(svc),
+            vcpu_runnable(svc->vcpu),
+            cpustr);
+    memset(cpustr, 0, sizeof(char)*1024);
+    cpumask_scnprintf(cpustr, sizeof(cpustr), 
cpupool_scheduler_cpumask(svc->vcpu->domain->cpupool));
+    printk("cpupool=%s\n", cpustr);
+#undef cpustr
+}
+
+static void
+rt_dump_pcpu(const struct scheduler *ops, int cpu)
+{
+    struct rt_vcpu *svc = RT_VCPU(curr_on_cpu(cpu));
+
+    printtime();
+    rt_dump_vcpu(svc);
+}
+
+/*
+ * should not need lock here. only showing stuff 
+ */
+static void
+rt_dump(const struct scheduler *ops)
+{
+    struct list_head *iter_sdom, *iter_svc, *runq, *iter;
+    struct rt_private *prv = RT_PRIV(ops);
+    struct rt_vcpu *svc;
+    int cpu = 0;
+    int loop = 0;
+
+    printtime();
+    printk("Priority Scheme: EDF\n");
+
+    printk("PCPU info: \n");
+    for_each_cpu(cpu, &prv->cpus) {
+        rt_dump_pcpu(ops, cpu);
+    }
+
+    printk("Global RunQueue info: \n");
+    loop = 0;
+    runq = RUNQ(ops);
+    list_for_each( iter, runq ) {
+        svc = __runq_elem(iter);
+        printk("\t%3d: ", ++loop);
+        rt_dump_vcpu(svc);
+    }
+
+    printk("Domain info: \n");
+    loop = 0;
+    list_for_each( iter_sdom, &prv->sdom ) {
+        struct rt_dom *sdom;
+        sdom = list_entry(iter_sdom, struct rt_dom, sdom_elem);
+        printk("\tdomain: %d\n", sdom->dom->domain_id);
+
+        list_for_each( iter_svc, &sdom->vcpu ) {
+            svc = list_entry(iter_svc, struct rt_vcpu, sdom_elem);
+            printk("\t\t%3d: ", ++loop);
+            rt_dump_vcpu(svc);
+        }
+    }
+
+    printk("\n");
+}
+
+/*
+ * Init/Free related code
+ */
+static int
+rt_init(struct scheduler *ops)
+{
+    struct rt_private *prv;
+
+    prv = xzalloc(struct rt_private);
+    if ( prv == NULL ) {
+        printk("xzalloc failed at rt_private\n");
+        return -ENOMEM;
+    }
+
+    ops->sched_data = prv;
+    spin_lock_init(&prv->lock);
+    INIT_LIST_HEAD(&prv->sdom);
+    INIT_LIST_HEAD(&prv->runq);
+    cpumask_clear(&prv->tickled);
+    cpumask_clear(&prv->cpus);
+
+    printtime();
+    printk("\n");
+
+    return 0;
+}
+
+static void
+rt_deinit(const struct scheduler *ops)
+{
+    struct rt_private *prv;
+
+    printtime();
+    printk("\n");
+
+    prv = RT_PRIV(ops);
+    if ( prv )
+        xfree(prv);
+}
+
+/* 
+ * point per_cpu spinlock to the global system lock; all cpu have same global 
system lock 
+ */
+static void *
+rt_alloc_pdata(const struct scheduler *ops, int cpu)
+{
+    struct rt_private *prv = RT_PRIV(ops);
+
+    cpumask_set_cpu(cpu, &prv->cpus);
+
+    per_cpu(schedule_data, cpu).schedule_lock = &prv->lock;
+
+    printtime();
+    printk("%s total cpus: %d", __FUNCTION__, cpumask_weight(&prv->cpus));
+    return (void *)1;
+}
+
+static void
+rt_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
+{
+    struct rt_private * prv = RT_PRIV(ops);
+    cpumask_clear_cpu(cpu, &prv->cpus);
+    printtime();
+    printk("%s cpu=%d\n", __FUNCTION__, cpu);
+}
+
+static void *
+rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
+{
+    unsigned long flags;
+    struct rt_dom *sdom;
+    struct rt_private * prv = RT_PRIV(ops);
+
+    printtime();
+    printk("dom=%d\n", dom->domain_id);
+
+    sdom = xzalloc(struct rt_dom);
+    if ( sdom == NULL ) {
+        printk("%s, xzalloc failed\n", __func__);
+        return NULL;
+    }
+
+    INIT_LIST_HEAD(&sdom->vcpu);
+    INIT_LIST_HEAD(&sdom->sdom_elem);
+    sdom->dom = dom;
+
+    /* spinlock here to insert the dom */
+    spin_lock_irqsave(&prv->lock, flags);
+    list_add_tail(&sdom->sdom_elem, &(prv->sdom));
+    spin_unlock_irqrestore(&prv->lock, flags);
+
+    return (void *)sdom;
+}
+
+static void
+rt_free_domdata(const struct scheduler *ops, void *data)
+{
+    unsigned long flags;
+    struct rt_dom *sdom = data;
+    struct rt_private * prv = RT_PRIV(ops);
+
+    printtime();
+    printk("dom=%d\n", sdom->dom->domain_id);
+
+    spin_lock_irqsave(&prv->lock, flags);
+    list_del_init(&sdom->sdom_elem);
+    spin_unlock_irqrestore(&prv->lock, flags);
+    xfree(data);
+}
+
+static int
+rt_dom_init(const struct scheduler *ops, struct domain *dom)
+{
+    struct rt_dom *sdom;
+
+    printtime();
+    printk("dom=%d\n", dom->domain_id);
+
+    /* IDLE Domain does not link on rt_private */
+    if ( is_idle_domain(dom) ) { return 0; }
+
+    sdom = rt_alloc_domdata(ops, dom);
+    if ( sdom == NULL ) {
+        printk("%s, failed\n", __func__);
+        return -ENOMEM;
+    }
+    dom->sched_priv = sdom;
+
+    return 0;
+}
+
+static void
+rt_dom_destroy(const struct scheduler *ops, struct domain *dom)
+{
+    printtime();
+    printk("dom=%d\n", dom->domain_id);
+
+    rt_free_domdata(ops, RT_DOM(dom));
+}
+
+static void *
+rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
+{
+    struct rt_vcpu *svc;
+    s_time_t now = NOW();
+    long count;
+
+    /* Allocate per-VCPU info */
+    svc = xzalloc(struct rt_vcpu);
+    if ( svc == NULL ) {
+        printk("%s, xzalloc failed\n", __func__);
+        return NULL;
+    }
+
+    INIT_LIST_HEAD(&svc->runq_elem);
+    INIT_LIST_HEAD(&svc->sdom_elem);
+    svc->flags = 0U;
+    svc->sdom = dd;
+    svc->vcpu = vc;
+    svc->last_start = 0;            /* init last_start is 0 */
+
+    svc->period = RT_DEFAULT_PERIOD;
+    if ( !is_idle_vcpu(vc) && vc->domain->domain_id != 0 ) {
+        svc->budget = RT_DEFAULT_BUDGET;
+    } else {
+        svc->budget = RT_DEFAULT_PERIOD; /* give vcpus of dom0 100% 
utilization */
+    }
+
+    count = (now/MILLISECS(svc->period)) + 1;
+    /* sync all VCPU's start time to 0 */
+    svc->cur_deadline += count*MILLISECS(svc->period);
+
+    svc->cur_budget = svc->budget*1000; /* counting in microseconds level */
+    /* Debug only: dump new vcpu's info */
+    printtime();
+    rt_dump_vcpu(svc);
+
+    return svc;
+}
+
+static void
+rt_free_vdata(const struct scheduler *ops, void *priv)
+{
+    struct rt_vcpu *svc = priv;
+
+    /* Debug only: dump freed vcpu's info */
+    printtime();
+    rt_dump_vcpu(svc);
+    xfree(svc);
+}
+
+static void
+rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
+{
+    struct rt_vcpu *svc = RT_VCPU(vc);
+
+    /* Debug only: dump info of vcpu to insert */
+    printtime();
+    rt_dump_vcpu(svc);
+
+    /* IDLE VCPU not allowed on RunQ */
+    if ( is_idle_vcpu(vc) )
+        return;
+
+    list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);   /* add to dom vcpu 
list */
+}
+
+static void
+rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
+{
+    struct rt_vcpu * const svc = RT_VCPU(vc);
+    struct rt_dom * const sdom = svc->sdom;
+
+    printtime();
+    rt_dump_vcpu(svc);
+
+    BUG_ON( sdom == NULL );
+    BUG_ON( __vcpu_on_runq(svc) );
+
+    if ( !is_idle_vcpu(vc) ) {
+        list_del_init(&svc->sdom_elem);
+    }
+}
+
+/* 
+ * Pick a valid CPU for the vcpu vc
+ * Valid CPU of a vcpu is intesection of vcpu's affinity and available cpus
+ */
+static int
+rt_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
+{
+    cpumask_t cpus;
+    cpumask_t *online;
+    int cpu;
+    struct rt_private * prv = RT_PRIV(ops);
+
+    online = cpupool_scheduler_cpumask(vc->domain->cpupool);
+    cpumask_and(&cpus, &prv->cpus, online);
+    cpumask_and(&cpus, &cpus, vc->cpu_hard_affinity);
+
+    cpu = cpumask_test_cpu(vc->processor, &cpus)
+            ? vc->processor 
+            : cpumask_cycle(vc->processor, &cpus);
+    ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );
+
+    return cpu;
+}
+
+/*
+ * Implemented as deferrable server. 
+ * Different server mechanism has different implementation.
+ * burn budget at microsecond level. 
+ */
+static void
+burn_budgets(const struct scheduler *ops, struct rt_vcpu *svc, s_time_t now) {
+    s_time_t delta;
+    unsigned int consume;
+    long count = 0;
+
+    /* first time called for this svc, update last_start */
+    if ( svc->last_start == 0 ) {
+        svc->last_start = now;
+        return;
+    }
+
+    /* don't burn budget for idle VCPU */
+    if ( is_idle_vcpu(svc->vcpu) ) {
+        return;
+    }
+
+    /* don't burn budget for Domain-0, RT-Xen use only */
+    if ( svc->sdom->dom->domain_id == 0 ) {
+        return;
+    }
+
+    /* update deadline info */
+    delta = now - svc->cur_deadline;
+    if ( delta >= 0 ) {
+        count = ( delta/MILLISECS(svc->period) ) + 1;
+        svc->cur_deadline += count * MILLISECS(svc->period);
+        svc->cur_budget = svc->budget * 1000;
+        return;
+    }
+
+    delta = now - svc->last_start;
+    if ( delta < 0 ) {
+        printk("%s, delta = %ld for ", __func__, delta);
+        rt_dump_vcpu(svc);
+        svc->last_start = now;  /* update last_start */
+        svc->cur_budget = 0;
+        return;
+    }
+
+    if ( svc->cur_budget == 0 ) return;
+
+    /* burn at microseconds level */
+    consume = ( delta/MICROSECS(1) );
+    if ( delta%MICROSECS(1) > MICROSECS(1)/2 ) consume++;
+
+    svc->cur_budget -= consume;
+    if ( svc->cur_budget < 0 ) svc->cur_budget = 0;
+}
+
+/* 
+ * RunQ is sorted. Pick first one within cpumask. If no one, return NULL
+ * lock is grabbed before calling this function 
+ */
+static struct rt_vcpu *
+__runq_pick(const struct scheduler *ops, cpumask_t mask)
+{
+    struct list_head *runq = RUNQ(ops);
+    struct list_head *iter;
+    struct rt_vcpu *svc = NULL;
+    struct rt_vcpu *iter_svc = NULL;
+    cpumask_t cpu_common;
+    cpumask_t *online;
+    struct rt_private * prv = RT_PRIV(ops);
+
+    list_for_each(iter, runq) {
+        iter_svc = __runq_elem(iter);
+
+        /* mask is intersection of cpu_hard_affinity and cpupool and 
priv->cpus */
+        online = cpupool_scheduler_cpumask(iter_svc->vcpu->domain->cpupool);
+        cpumask_and(&cpu_common, online, &prv->cpus);
+        cpumask_and(&cpu_common, &cpu_common, 
iter_svc->vcpu->cpu_hard_affinity);
+        cpumask_and(&cpu_common, &mask, &cpu_common);
+        if ( cpumask_empty(&cpu_common) )
+            continue;
+
+        if ( iter_svc->cur_budget <= 0 )
+            continue;
+
+        svc = iter_svc;
+        break;
+    }
+
+    return svc;
+}
+
+/*
+ * Update vcpu's budget and sort runq by insert the modifed vcpu back to runq
+ * lock is grabbed before calling this function 
+ */
+static void
+__repl_update(const struct scheduler *ops, s_time_t now)
+{
+    struct list_head *runq = RUNQ(ops);
+    struct list_head *iter;
+    struct list_head *tmp;
+    struct rt_vcpu *svc = NULL;
+
+    s_time_t diff;
+    long count;
+
+    list_for_each_safe(iter, tmp, runq) {
+        svc = __runq_elem(iter);
+
+        diff = now - svc->cur_deadline;
+        if ( diff > 0 ) {
+            count = (diff/MILLISECS(svc->period)) + 1;
+            svc->cur_deadline += count * MILLISECS(svc->period);
+            svc->cur_budget = svc->budget * 1000;
+            __runq_remove(svc);
+            __runq_insert(ops, svc);
+        }
+    }
+}
+
+/* 
+ * schedule function for rt scheduler.
+ * The lock is already grabbed in schedule.c, no need to lock here 
+ */
+static struct task_slice
+rt_schedule(const struct scheduler *ops, s_time_t now, bool_t 
tasklet_work_scheduled)
+{
+    const int cpu = smp_processor_id();
+    struct rt_private * prv = RT_PRIV(ops);
+    struct rt_vcpu * const scurr = RT_VCPU(current);
+    struct rt_vcpu * snext = NULL;
+    struct task_slice ret;
+
+    /* clear ticked bit now that we've been scheduled */
+    if ( cpumask_test_cpu(cpu, &prv->tickled) )
+        cpumask_clear_cpu(cpu, &prv->tickled);
+
+    /* burn_budget would return for IDLE VCPU */
+    burn_budgets(ops, scurr, now);
+
+    __repl_update(ops, now);
+
+    if ( tasklet_work_scheduled ) {
+        snext = RT_VCPU(idle_vcpu[cpu]);
+    } else {
+        cpumask_t cur_cpu;
+        cpumask_clear(&cur_cpu);
+        cpumask_set_cpu(cpu, &cur_cpu);
+        snext = __runq_pick(ops, cur_cpu);
+        if ( snext == NULL )
+            snext = RT_VCPU(idle_vcpu[cpu]);
+
+        /* if scurr has higher priority and budget, still pick scurr */
+        if ( !is_idle_vcpu(current) &&
+             vcpu_runnable(current) &&
+             scurr->cur_budget > 0 &&
+             ( is_idle_vcpu(snext->vcpu) ||
+               scurr->cur_deadline <= snext->cur_deadline ) ) {
+            snext = scurr;
+        }
+    }
+
+    if ( snext != scurr &&
+         !is_idle_vcpu(current) &&
+         vcpu_runnable(current) ) {
+        set_bit(__RT_delayed_runq_add, &scurr->flags);
+    }
+
+    snext->last_start = now;
+    ret.migrated = 0;
+    if ( !is_idle_vcpu(snext->vcpu) ) {
+        if ( snext != scurr ) {
+            __runq_remove(snext);
+            set_bit(__RT_scheduled, &snext->flags);
+        }
+        if ( snext->vcpu->processor != cpu ) {
+            snext->vcpu->processor = cpu;
+            ret.migrated = 1;
+        }
+    }
+
+    ret.time = MILLISECS(1);
+    ret.task = snext->vcpu;
+
+    return ret;
+}
+
+/*
+ * Remove VCPU from RunQ
+ * The lock is already grabbed in schedule.c, no need to lock here 
+ */
+static void
+rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
+{
+    struct rt_vcpu * const svc = RT_VCPU(vc);
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    if ( curr_on_cpu(vc->processor) == vc ) {
+        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
+        return;
+    }
+
+    if ( __vcpu_on_runq(svc) ) {
+        __runq_remove(svc);
+    }
+
+    clear_bit(__RT_delayed_runq_add, &svc->flags);
+}
+
+/*
+ * Pick a vcpu on a cpu to kick out to place the running candidate
+ * Called by wake() and context_saved()
+ * We have a running candidate here, the kick logic is:
+ * Among all the cpus that are within the cpu affinity
+ * 1) if the new->cpu is idle, kick it. This could benefit cache hit
+ * 2) if there are any idle vcpu, kick it.
+ * 3) now all pcpus are busy, among all the running vcpus, pick lowest 
priority one
+ *    if snext has higher priority, kick it.
+ *
+ * TODO:
+ * 1) what if these two vcpus belongs to the same domain?
+ *    replace a vcpu belonging to the same domain introduces more overhead
+ *
+ * lock is grabbed before calling this function 
+ */
+static void
+runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
+{
+    struct rt_private * prv = RT_PRIV(ops);
+    struct rt_vcpu * scheduled = NULL;    /* lowest priority scheduled */
+    struct rt_vcpu * iter_svc;
+    struct vcpu * iter_vc;
+    int cpu = 0;
+    cpumask_t not_tickled;                      /* not tickled cpus */
+    cpumask_t *online;
+
+    if ( new == NULL || is_idle_vcpu(new->vcpu) ) return;
+
+    online = cpupool_scheduler_cpumask(new->vcpu->domain->cpupool);
+    cpumask_and(&not_tickled, online, &prv->cpus);
+    cpumask_and(&not_tickled, &not_tickled, new->vcpu->cpu_hard_affinity);
+    cpumask_andnot(&not_tickled, &not_tickled, &prv->tickled);
+
+    /* 1) if new's previous cpu is idle, kick it for cache benefit */
+    if ( is_idle_vcpu(curr_on_cpu(new->vcpu->processor)) ) {
+        cpumask_set_cpu(new->vcpu->processor, &prv->tickled);
+        cpu_raise_softirq(new->vcpu->processor, SCHEDULE_SOFTIRQ);
+        return;
+    }
+
+    /* 2) if there are any idle pcpu, kick it */
+    /* The same loop also find the one with lowest priority */
+    for_each_cpu(cpu, &not_tickled) {
+        iter_vc = curr_on_cpu(cpu);
+        if ( is_idle_vcpu(iter_vc) ) {
+            cpumask_set_cpu(cpu, &prv->tickled);
+            cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
+            return;
+        }
+        iter_svc = RT_VCPU(iter_vc);
+        if ( scheduled == NULL || 
+             iter_svc->cur_deadline > scheduled->cur_deadline ) {
+            scheduled = iter_svc;
+        }
+    }
+
+    /* 3) candicate has higher priority, kick out the lowest priority vcpu */
+    if ( scheduled != NULL && new->cur_deadline < scheduled->cur_deadline ) {
+        cpumask_set_cpu(scheduled->vcpu->processor, &prv->tickled);
+        cpu_raise_softirq(scheduled->vcpu->processor, SCHEDULE_SOFTIRQ);
+    }
+    return;
+}
+
+/* 
+ * Should always wake up runnable vcpu, put it back to RunQ. 
+ * Check priority to raise interrupt 
+ * The lock is already grabbed in schedule.c, no need to lock here 
+ * TODO: what if these two vcpus belongs to the same domain? 
+ */
+static void
+rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
+{
+    struct rt_vcpu * const svc = RT_VCPU(vc);
+    s_time_t diff;
+    s_time_t now = NOW();
+    long count = 0;
+    struct rt_private * prv = RT_PRIV(ops);
+    struct rt_vcpu * snext = NULL;        /* highest priority on RunQ */
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    if ( unlikely(curr_on_cpu(vc->processor) == vc) ) return;
+
+    /* on RunQ, just update info is ok */
+    if ( unlikely(__vcpu_on_runq(svc)) ) return;
+
+    /* If context hasn't been saved yet, set flag so it will add later */
+    if ( unlikely(test_bit(__RT_scheduled, &svc->flags)) ) {
+        set_bit(__RT_delayed_runq_add, &svc->flags);
+        return;
+    }
+
+    /* update deadline info */
+    diff = now - svc->cur_deadline;
+    if ( diff >= 0 ) {
+        count = ( diff/MILLISECS(svc->period) ) + 1;
+        svc->cur_deadline += count * MILLISECS(svc->period);
+        svc->cur_budget = svc->budget * 1000;
+    }
+
+    __runq_insert(ops, svc);
+    __repl_update(ops, now);
+    snext = __runq_pick(ops, prv->cpus);    /* pick snext from ALL valid cpus 
*/
+    runq_tickle(ops, snext);
+
+    return;
+}
+
+/* 
+ * scurr has finished context switch, insert it back to the RunQ,
+ * and then pick the highest priority vcpu from runq to run 
+ */
+static void
+rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
+{
+    struct rt_vcpu * svc = RT_VCPU(vc);
+    struct rt_vcpu * snext = NULL;
+    struct rt_private * prv = RT_PRIV(ops);
+    spinlock_t *lock;
+
+    clear_bit(__RT_scheduled, &svc->flags);
+    if ( is_idle_vcpu(vc) ) return;
+
+    lock = vcpu_schedule_lock_irq(vc);
+    if ( test_and_clear_bit(__RT_delayed_runq_add, &svc->flags) && 
+         likely(vcpu_runnable(vc)) ) {
+        __runq_insert(ops, svc);
+        __repl_update(ops, NOW());
+        snext = __runq_pick(ops, prv->cpus);    /* pick snext from ALL cpus */
+        runq_tickle(ops, snext);
+    }
+    vcpu_schedule_unlock_irq(lock, vc);
+}
+
+/*
+ * set/get each vcpu info of each domain
+ */
+static int
+rt_dom_cntl(const struct scheduler *ops, struct domain *d, struct 
xen_domctl_scheduler_op *op)
+{
+    xen_domctl_sched_rt_params_t local_sched;
+    struct rt_dom * const sdom = RT_DOM(d);
+    struct list_head *iter;
+    int vcpu_index = 0;
+    int rc = -EINVAL;
+
+    switch ( op->cmd )
+    {
+    case XEN_DOMCTL_SCHEDOP_getinfo:
+        /* for debug use, whenever adjust Dom0 parameter, do global dump */
+        if ( d->domain_id == 0 ) {
+            rt_dump(ops);
+        }
+
+        local_sched.num_vcpus = 0;
+        list_for_each( iter, &sdom->vcpu ) {
+            struct rt_vcpu * svc = list_entry(iter, struct rt_vcpu, sdom_elem);
+
+            ASSERT(vcpu_index < XEN_LEGACY_MAX_VCPUS);
+            local_sched.vcpus[vcpu_index].budget = svc->budget;
+            local_sched.vcpus[vcpu_index].period = svc->period;
+
+            vcpu_index++;
+        }
+        local_sched.num_vcpus = vcpu_index;
+        copy_to_guest(op->u.rt.schedule, &local_sched, 1);
+        rc = 0;
+        break;
+    case XEN_DOMCTL_SCHEDOP_putinfo:
+        copy_from_guest(&local_sched, op->u.rt.schedule, 1);
+        list_for_each( iter, &sdom->vcpu ) {
+            struct rt_vcpu * svc = list_entry(iter, struct rt_vcpu, sdom_elem);
+
+            if ( local_sched.vcpu_index == svc->vcpu->vcpu_id ) { /* adjust 
per VCPU parameter */
+                vcpu_index = local_sched.vcpu_index;
+
+                if ( vcpu_index < 0 || vcpu_index > XEN_LEGACY_MAX_VCPUS) {
+                    printk("XEN_DOMCTL_SCHEDOP_putinfo: vcpu_index=%d\n",
+                            vcpu_index);
+                }else{
+                    printk("XEN_DOMCTL_SCHEDOP_putinfo: vcpu_index=%d, 
period=%"PRId64", budget=%"PRId64"\n",
+                            vcpu_index, local_sched.vcpus[vcpu_index].period, 
+                            local_sched.vcpus[vcpu_index].budget);
+                }
+
+                svc->period = local_sched.vcpus[vcpu_index].period;
+                svc->budget = local_sched.vcpus[vcpu_index].budget;
+
+                break;
+            }
+        }
+        rc = 0;
+        break;
+    }
+
+    return rc;
+}
+
+static struct rt_private _rt_priv;
+
+const struct scheduler sched_rt_def = {
+    .name           = "SMP RT Scheduler",
+    .opt_name       = "rt",
+    .sched_id       = XEN_SCHEDULER_RT,
+    .sched_data     = &_rt_priv,
+
+    .dump_cpu_state = rt_dump_pcpu,
+    .dump_settings  = rt_dump,
+    .init           = rt_init,
+    .deinit         = rt_deinit,
+    .alloc_pdata    = rt_alloc_pdata,
+    .free_pdata     = rt_free_pdata,
+    .alloc_domdata  = rt_alloc_domdata,
+    .free_domdata   = rt_free_domdata,
+    .init_domain    = rt_dom_init,
+    .destroy_domain = rt_dom_destroy,
+    .alloc_vdata    = rt_alloc_vdata,
+    .free_vdata     = rt_free_vdata,
+    .insert_vcpu    = rt_vcpu_insert,
+    .remove_vcpu    = rt_vcpu_remove,
+
+    .adjust         = rt_dom_cntl,
+
+    .pick_cpu       = rt_cpu_pick,
+    .do_schedule    = rt_schedule,
+    .sleep          = rt_vcpu_sleep,
+    .wake           = rt_vcpu_wake,
+    .context_saved  = rt_context_saved,
+
+    .yield          = NULL,
+    .migrate        = NULL,
+};
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index e9eb0bc..2d13966 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -68,6 +68,7 @@ static const struct scheduler *schedulers[] = {
     &sched_sedf_def,
     &sched_credit_def,
     &sched_credit2_def,
+    &sched_rt_def,
     &sched_arinc653_def,
 };
 
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 5b11bbf..d1a8201 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -339,6 +339,20 @@ struct xen_domctl_max_vcpus {
 typedef struct xen_domctl_max_vcpus xen_domctl_max_vcpus_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
 
+/*
+ * This structure is used to pass to rt scheduler from a 
+ * privileged domain to Xen
+ */
+struct xen_domctl_sched_rt_params {
+    struct {
+        signed long period; /* s_time_t type */
+        signed long budget; 
+    } vcpus[XEN_LEGACY_MAX_VCPUS];
+    uint16_t num_vcpus;
+    uint16_t vcpu_index;
+};
+typedef struct xen_domctl_sched_rt_params xen_domctl_sched_rt_params_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_sched_rt_params_t);
 
 /* XEN_DOMCTL_scheduler_op */
 /* Scheduler types. */
@@ -346,6 +360,8 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
 #define XEN_SCHEDULER_CREDIT   5
 #define XEN_SCHEDULER_CREDIT2  6
 #define XEN_SCHEDULER_ARINC653 7
+#define XEN_SCHEDULER_RT       8
+
 /* Set or get info? */
 #define XEN_DOMCTL_SCHEDOP_putinfo 0
 #define XEN_DOMCTL_SCHEDOP_getinfo 1
@@ -367,6 +383,9 @@ struct xen_domctl_scheduler_op {
         struct xen_domctl_sched_credit2 {
             uint16_t weight;
         } credit2;
+        struct xen_domctl_sched_rt{
+            XEN_GUEST_HANDLE_64(xen_domctl_sched_rt_params_t) schedule;
+        } rt;
     } u;
 };
 typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h
index 4164dff..e452d32 100644
--- a/xen/include/xen/sched-if.h
+++ b/xen/include/xen/sched-if.h
@@ -169,7 +169,7 @@ extern const struct scheduler sched_sedf_def;
 extern const struct scheduler sched_credit_def;
 extern const struct scheduler sched_credit2_def;
 extern const struct scheduler sched_arinc653_def;
-
+extern const struct scheduler sched_rt_def;
 
 struct cpupool
 {
-- 
1.7.9.5


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.