
[Xen-changelog] [xen staging] xen/sched: make sched-if.h really scheduler private



commit cbe977f5e0b42931fd76169595c4ab208e0d79af
Author:     Juergen Gross <jgross@xxxxxxxx>
AuthorDate: Thu Nov 7 15:34:37 2019 +0100
Commit:     Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
CommitDate: Wed Jan 22 17:37:11 2020 +0000

    xen/sched: make sched-if.h really scheduler private
    
    include/xen/sched-if.h should be private to scheduler code, so move it
    to common/sched/private.h and move the remaining use cases to
    cpupool.c and core.c.
    
    Signed-off-by: Juergen Gross <jgross@xxxxxxxx>
    Reviewed-by: Dario Faggioli <dfaggioli@xxxxxxxx>
---
 xen/arch/x86/dom0_build.c   |   5 +-
 xen/common/domain.c         |  70 -----
 xen/common/domctl.c         | 135 +---------
 xen/common/sched/arinc653.c |   3 +-
 xen/common/sched/core.c     | 191 +++++++++++++-
 xen/common/sched/cpupool.c  |  13 +-
 xen/common/sched/credit.c   |   2 +-
 xen/common/sched/credit2.c  |   3 +-
 xen/common/sched/null.c     |   3 +-
 xen/common/sched/private.h  | 622 +++++++++++++++++++++++++++++++++++++++++++
 xen/common/sched/rt.c       |   3 +-
 xen/include/xen/domain.h    |   3 +
 xen/include/xen/sched-if.h  | 625 --------------------------------------------
 xen/include/xen/sched.h     |   7 +
 14 files changed, 850 insertions(+), 835 deletions(-)
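
The effect of the move is that code outside xen/common/sched/ no longer sees
struct cpupool (or the other scheduler internals) and instead goes through the
small accessors this patch adds to cpupool.c.  As a rough illustrative sketch
only (the helper below is hypothetical and not part of the patch;
cpupool_get_id(), cpupool_valid_cpus() and cpupool0 are the interfaces actually
used in the diff):

#include <xen/cpumask.h>
#include <xen/lib.h>
#include <xen/sched.h>

/* Hypothetical example of a caller outside common/sched/. */
static unsigned int report_dom_pool(const struct domain *d)
{
    /* Replaces the old d->cpupool->cpupool_id dereference. */
    int pool_id = cpupool_get_id(d);

    /* Replaces the old cpupool0->cpu_valid dereference. */
    unsigned int ncpus = cpumask_weight(cpupool_valid_cpus(cpupool0));

    printk("d%d is in cpupool %d (pool0 has %u valid cpus)\n",
           d->domain_id, pool_id, ncpus);

    return ncpus;
}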

diff --git a/xen/arch/x86/dom0_build.c b/xen/arch/x86/dom0_build.c
index 28b964e018..56c2dee0fc 100644
--- a/xen/arch/x86/dom0_build.c
+++ b/xen/arch/x86/dom0_build.c
@@ -9,7 +9,6 @@
 #include <xen/libelf.h>
 #include <xen/pfn.h>
 #include <xen/sched.h>
-#include <xen/sched-if.h>
 #include <xen/softirq.h>
 
 #include <asm/amd.h>
@@ -227,9 +226,9 @@ unsigned int __init dom0_max_vcpus(void)
         dom0_nodes = node_online_map;
     for_each_node_mask ( node, dom0_nodes )
         cpumask_or(&dom0_cpus, &dom0_cpus, &node_to_cpumask(node));
-    cpumask_and(&dom0_cpus, &dom0_cpus, cpupool0->cpu_valid);
+    cpumask_and(&dom0_cpus, &dom0_cpus, cpupool_valid_cpus(cpupool0));
     if ( cpumask_empty(&dom0_cpus) )
-        cpumask_copy(&dom0_cpus, cpupool0->cpu_valid);
+        cpumask_copy(&dom0_cpus, cpupool_valid_cpus(cpupool0));
 
     max_vcpus = cpumask_weight(&dom0_cpus);
     if ( opt_dom0_max_vcpus_min > max_vcpus )
diff --git a/xen/common/domain.c b/xen/common/domain.c
index ee3f9ffd3e..dfea575b49 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -10,7 +10,6 @@
 #include <xen/ctype.h>
 #include <xen/err.h>
 #include <xen/sched.h>
-#include <xen/sched-if.h>
 #include <xen/domain.h>
 #include <xen/mm.h>
 #include <xen/event.h>
@@ -577,75 +576,6 @@ void __init setup_system_domains(void)
 #endif
 }
 
-void domain_update_node_affinity(struct domain *d)
-{
-    cpumask_var_t dom_cpumask, dom_cpumask_soft;
-    cpumask_t *dom_affinity;
-    const cpumask_t *online;
-    struct sched_unit *unit;
-    unsigned int cpu;
-
-    /* Do we have vcpus already? If not, no need to update node-affinity. */
-    if ( !d->vcpu || !d->vcpu[0] )
-        return;
-
-    if ( !zalloc_cpumask_var(&dom_cpumask) )
-        return;
-    if ( !zalloc_cpumask_var(&dom_cpumask_soft) )
-    {
-        free_cpumask_var(dom_cpumask);
-        return;
-    }
-
-    online = cpupool_domain_master_cpumask(d);
-
-    spin_lock(&d->node_affinity_lock);
-
-    /*
-     * If d->auto_node_affinity is true, let's compute the domain's
-     * node-affinity and update d->node_affinity accordingly. if false,
-     * just leave d->auto_node_affinity alone.
-     */
-    if ( d->auto_node_affinity )
-    {
-        /*
-         * We want the narrowest possible set of pcpus (to get the narrowest
-         * possible set of nodes). What we need is the cpumask of where the
-         * domain can run (the union of the hard affinity of all its vcpus),
-         * and the full mask of where it would prefer to run (the union of
-         * the soft affinity of all its various vcpus). Let's build them.
-         */
-        for_each_sched_unit ( d, unit )
-        {
-            cpumask_or(dom_cpumask, dom_cpumask, unit->cpu_hard_affinity);
-            cpumask_or(dom_cpumask_soft, dom_cpumask_soft,
-                       unit->cpu_soft_affinity);
-        }
-        /* Filter out non-online cpus */
-        cpumask_and(dom_cpumask, dom_cpumask, online);
-        ASSERT(!cpumask_empty(dom_cpumask));
-        /* And compute the intersection between hard, online and soft */
-        cpumask_and(dom_cpumask_soft, dom_cpumask_soft, dom_cpumask);
-
-        /*
-         * If not empty, the intersection of hard, soft and online is the
-         * narrowest set we want. If empty, we fall back to hard&online.
-         */
-        dom_affinity = cpumask_empty(dom_cpumask_soft) ?
-                           dom_cpumask : dom_cpumask_soft;
-
-        nodes_clear(d->node_affinity);
-        for_each_cpu ( cpu, dom_affinity )
-            node_set(cpu_to_node(cpu), d->node_affinity);
-    }
-
-    spin_unlock(&d->node_affinity_lock);
-
-    free_cpumask_var(dom_cpumask_soft);
-    free_cpumask_var(dom_cpumask);
-}
-
-
 int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity)
 {
     /* Being disjoint with the system is just wrong. */
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index 650310e874..8b819f56e5 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -11,7 +11,6 @@
 #include <xen/err.h>
 #include <xen/mm.h>
 #include <xen/sched.h>
-#include <xen/sched-if.h>
 #include <xen/domain.h>
 #include <xen/event.h>
 #include <xen/grant_table.h>
@@ -65,9 +64,9 @@ static int bitmap_to_xenctl_bitmap(struct xenctl_bitmap *xenctl_bitmap,
     return err;
 }
 
-static int xenctl_bitmap_to_bitmap(unsigned long *bitmap,
-                                   const struct xenctl_bitmap *xenctl_bitmap,
-                                   unsigned int nbits)
+int xenctl_bitmap_to_bitmap(unsigned long *bitmap,
+                            const struct xenctl_bitmap *xenctl_bitmap,
+                            unsigned int nbits)
 {
     unsigned int guest_bytes, copy_bytes;
     int err = 0;
@@ -200,7 +199,7 @@ void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info)
     info->shared_info_frame = mfn_to_gmfn(d, virt_to_mfn(d->shared_info));
     BUG_ON(SHARED_M2P(info->shared_info_frame));
 
-    info->cpupool = d->cpupool ? d->cpupool->cpupool_id : CPUPOOLID_NONE;
+    info->cpupool = cpupool_get_id(d);
 
     memcpy(info->handle, d->handle, sizeof(xen_domain_handle_t));
 
@@ -234,16 +233,6 @@ void domctl_lock_release(void)
     spin_unlock(&current->domain->hypercall_deadlock_mutex);
 }
 
-static inline
-int vcpuaffinity_params_invalid(const struct xen_domctl_vcpuaffinity *vcpuaff)
-{
-    return vcpuaff->flags == 0 ||
-           ((vcpuaff->flags & XEN_VCPUAFFINITY_HARD) &&
-            guest_handle_is_null(vcpuaff->cpumap_hard.bitmap)) ||
-           ((vcpuaff->flags & XEN_VCPUAFFINITY_SOFT) &&
-            guest_handle_is_null(vcpuaff->cpumap_soft.bitmap));
-}
-
 void vnuma_destroy(struct vnuma_info *vnuma)
 {
     if ( vnuma )
@@ -608,122 +597,8 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
 
     case XEN_DOMCTL_setvcpuaffinity:
     case XEN_DOMCTL_getvcpuaffinity:
-    {
-        struct vcpu *v;
-        const struct sched_unit *unit;
-        struct xen_domctl_vcpuaffinity *vcpuaff = &op->u.vcpuaffinity;
-
-        ret = -EINVAL;
-        if ( vcpuaff->vcpu >= d->max_vcpus )
-            break;
-
-        ret = -ESRCH;
-        if ( (v = d->vcpu[vcpuaff->vcpu]) == NULL )
-            break;
-
-        unit = v->sched_unit;
-        ret = -EINVAL;
-        if ( vcpuaffinity_params_invalid(vcpuaff) )
-            break;
-
-        if ( op->cmd == XEN_DOMCTL_setvcpuaffinity )
-        {
-            cpumask_var_t new_affinity, old_affinity;
-            cpumask_t *online = cpupool_domain_master_cpumask(v->domain);
-
-            /*
-             * We want to be able to restore hard affinity if we are trying
-             * setting both and changing soft affinity (which happens later,
-             * when hard affinity has been successfully changed already) fails.
-             */
-            if ( !alloc_cpumask_var(&old_affinity) )
-            {
-                ret = -ENOMEM;
-                break;
-            }
-            cpumask_copy(old_affinity, unit->cpu_hard_affinity);
-
-            if ( !alloc_cpumask_var(&new_affinity) )
-            {
-                free_cpumask_var(old_affinity);
-                ret = -ENOMEM;
-                break;
-            }
-
-            /* Undo a stuck SCHED_pin_override? */
-            if ( vcpuaff->flags & XEN_VCPUAFFINITY_FORCE )
-                vcpu_temporary_affinity(v, NR_CPUS, VCPU_AFFINITY_OVERRIDE);
-
-            ret = 0;
-
-            /*
-             * We both set a new affinity and report back to the caller what
-             * the scheduler will be effectively using.
-             */
-            if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
-            {
-                ret = xenctl_bitmap_to_bitmap(cpumask_bits(new_affinity),
-                                              &vcpuaff->cpumap_hard,
-                                              nr_cpu_ids);
-                if ( !ret )
-                    ret = vcpu_set_hard_affinity(v, new_affinity);
-                if ( ret )
-                    goto setvcpuaffinity_out;
-
-                /*
-                 * For hard affinity, what we return is the intersection of
-                 * cpupool's online mask and the new hard affinity.
-                 */
-                cpumask_and(new_affinity, online, unit->cpu_hard_affinity);
-                ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_hard,
-                                               new_affinity);
-            }
-            if ( vcpuaff->flags & XEN_VCPUAFFINITY_SOFT )
-            {
-                ret = xenctl_bitmap_to_bitmap(cpumask_bits(new_affinity),
-                                              &vcpuaff->cpumap_soft,
-                                              nr_cpu_ids);
-                if ( !ret)
-                    ret = vcpu_set_soft_affinity(v, new_affinity);
-                if ( ret )
-                {
-                    /*
-                     * Since we're returning error, the caller expects nothing
-                     * happened, so we rollback the changes to hard affinity
-                     * (if any).
-                     */
-                    if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
-                        vcpu_set_hard_affinity(v, old_affinity);
-                    goto setvcpuaffinity_out;
-                }
-
-                /*
-                 * For soft affinity, we return the intersection between the
-                 * new soft affinity, the cpupool's online map and the (new)
-                 * hard affinity.
-                 */
-                cpumask_and(new_affinity, new_affinity, online);
-                cpumask_and(new_affinity, new_affinity,
-                            unit->cpu_hard_affinity);
-                ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_soft,
-                                               new_affinity);
-            }
-
- setvcpuaffinity_out:
-            free_cpumask_var(new_affinity);
-            free_cpumask_var(old_affinity);
-        }
-        else
-        {
-            if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
-                ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_hard,
-                                               unit->cpu_hard_affinity);
-            if ( vcpuaff->flags & XEN_VCPUAFFINITY_SOFT )
-                ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_soft,
-                                               unit->cpu_soft_affinity);
-        }
+        ret = vcpu_affinity_domctl(d, op->cmd, &op->u.vcpuaffinity);
         break;
-    }
 
     case XEN_DOMCTL_scheduler_op:
         ret = sched_adjust(d, &op->u.scheduler_op);
diff --git a/xen/common/sched/arinc653.c b/xen/common/sched/arinc653.c
index 565575c326..8895d92b5e 100644
--- a/xen/common/sched/arinc653.c
+++ b/xen/common/sched/arinc653.c
@@ -26,7 +26,6 @@
 
 #include <xen/lib.h>
 #include <xen/sched.h>
-#include <xen/sched-if.h>
 #include <xen/timer.h>
 #include <xen/softirq.h>
 #include <xen/time.h>
@@ -35,6 +34,8 @@
 #include <xen/guest_access.h>
 #include <public/sysctl.h>
 
+#include "private.h"
+
 /**************************************************************************
  * Private Macros                                                         *
  **************************************************************************/
diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
index 4d8eb4c617..2fae959e90 100644
--- a/xen/common/sched/core.c
+++ b/xen/common/sched/core.c
@@ -23,7 +23,6 @@
 #include <xen/time.h>
 #include <xen/timer.h>
 #include <xen/perfc.h>
-#include <xen/sched-if.h>
 #include <xen/softirq.h>
 #include <xen/trace.h>
 #include <xen/mm.h>
@@ -38,6 +37,8 @@
 #include <xsm/xsm.h>
 #include <xen/err.h>
 
+#include "private.h"
+
 #ifdef CONFIG_XEN_GUEST
 #include <asm/guest.h>
 #else
@@ -1607,6 +1608,194 @@ int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason)
     return ret;
 }
 
+static inline
+int vcpuaffinity_params_invalid(const struct xen_domctl_vcpuaffinity *vcpuaff)
+{
+    return vcpuaff->flags == 0 ||
+           ((vcpuaff->flags & XEN_VCPUAFFINITY_HARD) &&
+            guest_handle_is_null(vcpuaff->cpumap_hard.bitmap)) ||
+           ((vcpuaff->flags & XEN_VCPUAFFINITY_SOFT) &&
+            guest_handle_is_null(vcpuaff->cpumap_soft.bitmap));
+}
+
+int vcpu_affinity_domctl(struct domain *d, uint32_t cmd,
+                         struct xen_domctl_vcpuaffinity *vcpuaff)
+{
+    struct vcpu *v;
+    const struct sched_unit *unit;
+    int ret = 0;
+
+    if ( vcpuaff->vcpu >= d->max_vcpus )
+        return -EINVAL;
+
+    if ( (v = d->vcpu[vcpuaff->vcpu]) == NULL )
+        return -ESRCH;
+
+    if ( vcpuaffinity_params_invalid(vcpuaff) )
+        return -EINVAL;
+
+    unit = v->sched_unit;
+
+    if ( cmd == XEN_DOMCTL_setvcpuaffinity )
+    {
+        cpumask_var_t new_affinity, old_affinity;
+        cpumask_t *online = cpupool_domain_master_cpumask(v->domain);
+
+        /*
+         * We want to be able to restore hard affinity if we are trying
+         * setting both and changing soft affinity (which happens later,
+         * when hard affinity has been successfully changed already) fails.
+         */
+        if ( !alloc_cpumask_var(&old_affinity) )
+            return -ENOMEM;
+
+        cpumask_copy(old_affinity, unit->cpu_hard_affinity);
+
+        if ( !alloc_cpumask_var(&new_affinity) )
+        {
+            free_cpumask_var(old_affinity);
+            return -ENOMEM;
+        }
+
+        /* Undo a stuck SCHED_pin_override? */
+        if ( vcpuaff->flags & XEN_VCPUAFFINITY_FORCE )
+            vcpu_temporary_affinity(v, NR_CPUS, VCPU_AFFINITY_OVERRIDE);
+
+        ret = 0;
+
+        /*
+         * We both set a new affinity and report back to the caller what
+         * the scheduler will be effectively using.
+         */
+        if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
+        {
+            ret = xenctl_bitmap_to_bitmap(cpumask_bits(new_affinity),
+                                          &vcpuaff->cpumap_hard, nr_cpu_ids);
+            if ( !ret )
+                ret = vcpu_set_hard_affinity(v, new_affinity);
+            if ( ret )
+                goto setvcpuaffinity_out;
+
+            /*
+             * For hard affinity, what we return is the intersection of
+             * cpupool's online mask and the new hard affinity.
+             */
+            cpumask_and(new_affinity, online, unit->cpu_hard_affinity);
+            ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_hard, new_affinity);
+        }
+        if ( vcpuaff->flags & XEN_VCPUAFFINITY_SOFT )
+        {
+            ret = xenctl_bitmap_to_bitmap(cpumask_bits(new_affinity),
+                                          &vcpuaff->cpumap_soft, nr_cpu_ids);
+            if ( !ret)
+                ret = vcpu_set_soft_affinity(v, new_affinity);
+            if ( ret )
+            {
+                /*
+                 * Since we're returning error, the caller expects nothing
+                 * happened, so we rollback the changes to hard affinity
+                 * (if any).
+                 */
+                if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
+                    vcpu_set_hard_affinity(v, old_affinity);
+                goto setvcpuaffinity_out;
+            }
+
+            /*
+             * For soft affinity, we return the intersection between the
+             * new soft affinity, the cpupool's online map and the (new)
+             * hard affinity.
+             */
+            cpumask_and(new_affinity, new_affinity, online);
+            cpumask_and(new_affinity, new_affinity, unit->cpu_hard_affinity);
+            ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_soft, new_affinity);
+        }
+
+ setvcpuaffinity_out:
+        free_cpumask_var(new_affinity);
+        free_cpumask_var(old_affinity);
+    }
+    else
+    {
+        if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
+            ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_hard,
+                                           unit->cpu_hard_affinity);
+        if ( vcpuaff->flags & XEN_VCPUAFFINITY_SOFT )
+            ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_soft,
+                                           unit->cpu_soft_affinity);
+    }
+
+    return ret;
+}
+
+void domain_update_node_affinity(struct domain *d)
+{
+    cpumask_var_t dom_cpumask, dom_cpumask_soft;
+    cpumask_t *dom_affinity;
+    const cpumask_t *online;
+    struct sched_unit *unit;
+    unsigned int cpu;
+
+    /* Do we have vcpus already? If not, no need to update node-affinity. */
+    if ( !d->vcpu || !d->vcpu[0] )
+        return;
+
+    if ( !zalloc_cpumask_var(&dom_cpumask) )
+        return;
+    if ( !zalloc_cpumask_var(&dom_cpumask_soft) )
+    {
+        free_cpumask_var(dom_cpumask);
+        return;
+    }
+
+    online = cpupool_domain_master_cpumask(d);
+
+    spin_lock(&d->node_affinity_lock);
+
+    /*
+     * If d->auto_node_affinity is true, let's compute the domain's
+     * node-affinity and update d->node_affinity accordingly. if false,
+     * just leave d->auto_node_affinity alone.
+     */
+    if ( d->auto_node_affinity )
+    {
+        /*
+         * We want the narrowest possible set of pcpus (to get the narrowest
+         * possible set of nodes). What we need is the cpumask of where the
+         * domain can run (the union of the hard affinity of all its vcpus),
+         * and the full mask of where it would prefer to run (the union of
+         * the soft affinity of all its various vcpus). Let's build them.
+         */
+        for_each_sched_unit ( d, unit )
+        {
+            cpumask_or(dom_cpumask, dom_cpumask, unit->cpu_hard_affinity);
+            cpumask_or(dom_cpumask_soft, dom_cpumask_soft,
+                       unit->cpu_soft_affinity);
+        }
+        /* Filter out non-online cpus */
+        cpumask_and(dom_cpumask, dom_cpumask, online);
+        ASSERT(!cpumask_empty(dom_cpumask));
+        /* And compute the intersection between hard, online and soft */
+        cpumask_and(dom_cpumask_soft, dom_cpumask_soft, dom_cpumask);
+
+        /*
+         * If not empty, the intersection of hard, soft and online is the
+         * narrowest set we want. If empty, we fall back to hard&online.
+         */
+        dom_affinity = cpumask_empty(dom_cpumask_soft) ?
+                           dom_cpumask : dom_cpumask_soft;
+
+        nodes_clear(d->node_affinity);
+        for_each_cpu ( cpu, dom_affinity )
+            node_set(cpu_to_node(cpu), d->node_affinity);
+    }
+
+    spin_unlock(&d->node_affinity_lock);
+
+    free_cpumask_var(dom_cpumask_soft);
+    free_cpumask_var(dom_cpumask);
+}
+
 typedef long ret_t;
 
 #endif /* !COMPAT */
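
For reference, the node-affinity computation that moves into core.c narrows the
candidate set in steps: the union of the units' hard affinities, intersected
with the cpupool's online ("master") cpus, then intersected with the union of
the soft affinities, falling back to hard-and-online when that last
intersection is empty.  A rough standalone model of the same arithmetic with
plain 64-bit masks (made-up values, not Xen code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t hard_union = 0x0ff;  /* union of per-unit hard affinities */
    uint64_t soft_union = 0xf00;  /* union of per-unit soft affinities */
    uint64_t online     = 0x0f0;  /* cpus online in the domain's cpupool */

    uint64_t dom_cpumask      = hard_union & online;      /* hard & online */
    uint64_t dom_cpumask_soft = soft_union & dom_cpumask; /* ... & soft */

    /* Empty three-way intersection: fall back to hard & online. */
    uint64_t dom_affinity = dom_cpumask_soft ? dom_cpumask_soft : dom_cpumask;

    printf("hard&online=%#x soft part=%#x node-affinity base=%#x\n",
           (unsigned int)dom_cpumask, (unsigned int)dom_cpumask_soft,
           (unsigned int)dom_affinity);
    return 0;
}
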
diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c
index d66b541a94..3060a7144a 100644
--- a/xen/common/sched/cpupool.c
+++ b/xen/common/sched/cpupool.c
@@ -16,11 +16,12 @@
 #include <xen/cpumask.h>
 #include <xen/percpu.h>
 #include <xen/sched.h>
-#include <xen/sched-if.h>
 #include <xen/warning.h>
 #include <xen/keyhandler.h>
 #include <xen/cpu.h>
 
+#include "private.h"
+
 #define for_each_cpupool(ptr)    \
     for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next))
 
@@ -875,6 +876,16 @@ int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op)
     return ret;
 }
 
+int cpupool_get_id(const struct domain *d)
+{
+    return d->cpupool ? d->cpupool->cpupool_id : CPUPOOLID_NONE;
+}
+
+const cpumask_t *cpupool_valid_cpus(const struct cpupool *pool)
+{
+    return pool->cpu_valid;
+}
+
 void dump_runq(unsigned char key)
 {
     unsigned long    flags;
diff --git a/xen/common/sched/credit.c b/xen/common/sched/credit.c
index aa41a3301b..4329d9df56 100644
--- a/xen/common/sched/credit.c
+++ b/xen/common/sched/credit.c
@@ -15,7 +15,6 @@
 #include <xen/delay.h>
 #include <xen/event.h>
 #include <xen/time.h>
-#include <xen/sched-if.h>
 #include <xen/softirq.h>
 #include <asm/atomic.h>
 #include <asm/div64.h>
@@ -24,6 +23,7 @@
 #include <xen/trace.h>
 #include <xen/err.h>
 
+#include "private.h"
 
 /*
  * Locking:
diff --git a/xen/common/sched/credit2.c b/xen/common/sched/credit2.c
index f7c477053c..65e8ab052e 100644
--- a/xen/common/sched/credit2.c
+++ b/xen/common/sched/credit2.c
@@ -18,7 +18,6 @@
 #include <xen/event.h>
 #include <xen/time.h>
 #include <xen/perfc.h>
-#include <xen/sched-if.h>
 #include <xen/softirq.h>
 #include <asm/div64.h>
 #include <xen/errno.h>
@@ -26,6 +25,8 @@
 #include <xen/cpu.h>
 #include <xen/keyhandler.h>
 
+#include "private.h"
+
 /* Meant only for helping developers during debugging. */
 /* #define d2printk printk */
 #define d2printk(x...)
diff --git a/xen/common/sched/null.c b/xen/common/sched/null.c
index 3f3418c9b1..b99f1e3c65 100644
--- a/xen/common/sched/null.c
+++ b/xen/common/sched/null.c
@@ -29,10 +29,11 @@
  */
 
 #include <xen/sched.h>
-#include <xen/sched-if.h>
 #include <xen/softirq.h>
 #include <xen/trace.h>
 
+#include "private.h"
+
 /*
  * null tracing events. Check include/public/trace.h for more details.
  */
diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h
new file mode 100644
index 0000000000..a702fd23b1
--- /dev/null
+++ b/xen/common/sched/private.h
@@ -0,0 +1,622 @@
+/******************************************************************************
+ * Additional declarations for the generic scheduler interface.  This should
+ * only be included by files that implement conforming schedulers.
+ *
+ * Portions by Mark Williamson are (C) 2004 Intel Research Cambridge
+ */
+
+#ifndef __XEN_SCHED_IF_H__
+#define __XEN_SCHED_IF_H__
+
+#include <xen/percpu.h>
+#include <xen/err.h>
+#include <xen/rcupdate.h>
+
+/* cpus currently in no cpupool */
+extern cpumask_t cpupool_free_cpus;
+
+/* Scheduler generic parameters
+ * */
+#define SCHED_DEFAULT_RATELIMIT_US 1000
+extern int sched_ratelimit_us;
+
+/* Scheduling resource mask. */
+extern cpumask_t sched_res_mask;
+
+/* Number of vcpus per struct sched_unit. */
+enum sched_gran {
+    SCHED_GRAN_cpu,
+    SCHED_GRAN_core,
+    SCHED_GRAN_socket
+};
+
+/*
+ * In order to allow a scheduler to remap the lock->cpu mapping,
+ * we have a per-cpu pointer, along with a pre-allocated set of
+ * locks.  The generic schedule init code will point each schedule lock
+ * pointer to the schedule lock; if the scheduler wants to remap them,
+ * it can simply modify the schedule locks.
+ * 
+ * For cache betterness, keep the actual lock in the same cache area
+ * as the rest of the struct.  Just have the scheduler point to the
+ * one it wants (This may be the one right in front of it).*/
+struct sched_resource {
+    struct scheduler   *scheduler;
+    struct cpupool     *cpupool;
+    spinlock_t         *schedule_lock,
+                       _lock;
+    struct sched_unit  *curr;
+    struct sched_unit  *sched_unit_idle;
+    struct sched_unit  *prev;
+    void               *sched_priv;
+    struct timer        s_timer;        /* scheduling timer                */
+
+    /* Cpu with lowest id in scheduling resource. */
+    unsigned int        master_cpu;
+    unsigned int        granularity;
+    cpumask_var_t       cpus;           /* cpus covered by this struct     */
+    struct rcu_head     rcu;
+};
+
+DECLARE_PER_CPU(struct sched_resource *, sched_res);
+extern rcu_read_lock_t sched_res_rculock;
+
+static inline struct sched_resource *get_sched_res(unsigned int cpu)
+{
+    return rcu_dereference(per_cpu(sched_res, cpu));
+}
+
+static inline void set_sched_res(unsigned int cpu, struct sched_resource *res)
+{
+    rcu_assign_pointer(per_cpu(sched_res, cpu), res);
+}
+
+static inline struct sched_unit *curr_on_cpu(unsigned int cpu)
+{
+    return get_sched_res(cpu)->curr;
+}
+
+static inline bool is_idle_unit(const struct sched_unit *unit)
+{
+    return is_idle_vcpu(unit->vcpu_list);
+}
+
+/* Returns true if at least one vcpu of the unit is online. */
+static inline bool is_unit_online(const struct sched_unit *unit)
+{
+    const struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        if ( is_vcpu_online(v) )
+            return true;
+
+    return false;
+}
+
+static inline unsigned int unit_running(const struct sched_unit *unit)
+{
+    return unit->runstate_cnt[RUNSTATE_running];
+}
+
+/* Returns true if at least one vcpu of the unit is runnable. */
+static inline bool unit_runnable(const struct sched_unit *unit)
+{
+    const struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        if ( vcpu_runnable(v) )
+            return true;
+
+    return false;
+}
+
+static inline int vcpu_runstate_blocked(const struct vcpu *v)
+{
+    return (v->pause_flags & VPF_blocked) ? RUNSTATE_blocked : RUNSTATE_offline;
+}
+
+/*
+ * Returns whether a sched_unit is runnable and sets new_state for each of its
+ * vcpus. It is mandatory to determine the new runstate for all vcpus of a unit
+ * without dropping the schedule lock (which happens when synchronizing the
+ * context switch of the vcpus of a unit) in order to avoid races with e.g.
+ * vcpu_sleep().
+ */
+static inline bool unit_runnable_state(const struct sched_unit *unit)
+{
+    struct vcpu *v;
+    bool runnable, ret = false;
+
+    if ( is_idle_unit(unit) )
+        return true;
+
+    for_each_sched_unit_vcpu ( unit, v )
+    {
+        runnable = vcpu_runnable(v);
+
+        v->new_state = runnable ? RUNSTATE_running : vcpu_runstate_blocked(v);
+
+        if ( runnable )
+            ret = true;
+    }
+
+    return ret;
+}
+
+static inline void sched_set_res(struct sched_unit *unit,
+                                 struct sched_resource *res)
+{
+    unsigned int cpu = cpumask_first(res->cpus);
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+    {
+        ASSERT(cpu < nr_cpu_ids);
+        v->processor = cpu;
+        cpu = cpumask_next(cpu, res->cpus);
+    }
+
+    unit->res = res;
+}
+
+/* Return master cpu of the scheduling resource the unit is assigned to. */
+static inline unsigned int sched_unit_master(const struct sched_unit *unit)
+{
+    return unit->res->master_cpu;
+}
+
+/* Set a bit in pause_flags of all vcpus of a unit. */
+static inline void sched_set_pause_flags(struct sched_unit *unit,
+                                         unsigned int bit)
+{
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        __set_bit(bit, &v->pause_flags);
+}
+
+/* Clear a bit in pause_flags of all vcpus of a unit. */
+static inline void sched_clear_pause_flags(struct sched_unit *unit,
+                                           unsigned int bit)
+{
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        __clear_bit(bit, &v->pause_flags);
+}
+
+/* Set a bit in pause_flags of all vcpus of a unit via atomic updates. */
+static inline void sched_set_pause_flags_atomic(struct sched_unit *unit,
+                                                unsigned int bit)
+{
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        set_bit(bit, &v->pause_flags);
+}
+
+/* Clear a bit in pause_flags of all vcpus of a unit via atomic updates. */
+static inline void sched_clear_pause_flags_atomic(struct sched_unit *unit,
+                                                  unsigned int bit)
+{
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        clear_bit(bit, &v->pause_flags);
+}
+
+static inline struct sched_unit *sched_idle_unit(unsigned int cpu)
+{
+    return get_sched_res(cpu)->sched_unit_idle;
+}
+
+static inline unsigned int sched_get_resource_cpu(unsigned int cpu)
+{
+    return get_sched_res(cpu)->master_cpu;
+}
+
+/*
+ * Scratch space, for avoiding having too many cpumask_t on the stack.
+ * Within each scheduler, when using the scratch mask of one pCPU:
+ * - the pCPU must belong to the scheduler,
+ * - the caller must own the per-pCPU scheduler lock (a.k.a. runqueue
+ *   lock).
+ */
+DECLARE_PER_CPU(cpumask_t, cpumask_scratch);
+#define cpumask_scratch        (&this_cpu(cpumask_scratch))
+#define cpumask_scratch_cpu(c) (&per_cpu(cpumask_scratch, c))
+
+#define sched_lock(kind, param, cpu, irq, arg...) \
+static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \
+{ \
+    for ( ; ; ) \
+    { \
+        spinlock_t *lock = get_sched_res(cpu)->schedule_lock; \
+        /* \
+         * v->processor may change when grabbing the lock; but \
+         * per_cpu(v->processor) may also change, if changing cpu pool \
+         * also changes the scheduler lock.  Retry until they match. \
+         * \
+         * It may also be the case that v->processor may change but the \
+         * lock may be the same; this will succeed in that case. \
+         */ \
+        spin_lock##irq(lock, ## arg); \
+        if ( likely(lock == get_sched_res(cpu)->schedule_lock) ) \
+            return lock; \
+        spin_unlock##irq(lock, ## arg); \
+    } \
+}
+
+#define sched_unlock(kind, param, cpu, irq, arg...) \
+static inline void kind##_schedule_unlock##irq(spinlock_t *lock \
+                                               EXTRA_TYPE(arg), param) \
+{ \
+    ASSERT(lock == get_sched_res(cpu)->schedule_lock); \
+    spin_unlock##irq(lock, ## arg); \
+}
+
+#define EXTRA_TYPE(arg)
+sched_lock(pcpu, unsigned int cpu,     cpu, )
+sched_lock(unit, const struct sched_unit *i, i->res->master_cpu, )
+sched_lock(pcpu, unsigned int cpu,     cpu,          _irq)
+sched_lock(unit, const struct sched_unit *i, i->res->master_cpu, _irq)
+sched_unlock(pcpu, unsigned int cpu,     cpu, )
+sched_unlock(unit, const struct sched_unit *i, i->res->master_cpu, )
+sched_unlock(pcpu, unsigned int cpu,     cpu,          _irq)
+sched_unlock(unit, const struct sched_unit *i, i->res->master_cpu, _irq)
+#undef EXTRA_TYPE
+
+#define EXTRA_TYPE(arg) , unsigned long arg
+#define spin_unlock_irqsave spin_unlock_irqrestore
+sched_lock(pcpu, unsigned int cpu,     cpu,          _irqsave, *flags)
+sched_lock(unit, const struct sched_unit *i, i->res->master_cpu, _irqsave, *flags)
+#undef spin_unlock_irqsave
+sched_unlock(pcpu, unsigned int cpu,     cpu,          _irqrestore, flags)
+sched_unlock(unit, const struct sched_unit *i, i->res->master_cpu, _irqrestore, flags)
+#undef EXTRA_TYPE
+
+#undef sched_unlock
+#undef sched_lock
+
+static inline spinlock_t *pcpu_schedule_trylock(unsigned int cpu)
+{
+    spinlock_t *lock = get_sched_res(cpu)->schedule_lock;
+
+    if ( !spin_trylock(lock) )
+        return NULL;
+    if ( lock == get_sched_res(cpu)->schedule_lock )
+        return lock;
+    spin_unlock(lock);
+    return NULL;
+}
+
+struct scheduler {
+    char *name;             /* full name for this scheduler      */
+    char *opt_name;         /* option name for this scheduler    */
+    unsigned int sched_id;  /* ID for this scheduler             */
+    void *sched_data;       /* global data pointer               */
+
+    int          (*global_init)    (void);
+
+    int          (*init)           (struct scheduler *);
+    void         (*deinit)         (struct scheduler *);
+
+    void         (*free_udata)     (const struct scheduler *, void *);
+    void *       (*alloc_udata)    (const struct scheduler *,
+                                    struct sched_unit *, void *);
+    void         (*free_pdata)     (const struct scheduler *, void *, int);
+    void *       (*alloc_pdata)    (const struct scheduler *, int);
+    void         (*init_pdata)     (const struct scheduler *, void *, int);
+    void         (*deinit_pdata)   (const struct scheduler *, void *, int);
+
+    /* Returns ERR_PTR(-err) for error, NULL for 'nothing needed'. */
+    void *       (*alloc_domdata)  (const struct scheduler *, struct domain *);
+    /* Idempotent. */
+    void         (*free_domdata)   (const struct scheduler *, void *);
+
+    spinlock_t * (*switch_sched)   (struct scheduler *, unsigned int,
+                                    void *, void *);
+
+    /* Activate / deactivate units in a cpu pool */
+    void         (*insert_unit)    (const struct scheduler *,
+                                    struct sched_unit *);
+    void         (*remove_unit)    (const struct scheduler *,
+                                    struct sched_unit *);
+
+    void         (*sleep)          (const struct scheduler *,
+                                    struct sched_unit *);
+    void         (*wake)           (const struct scheduler *,
+                                    struct sched_unit *);
+    void         (*yield)          (const struct scheduler *,
+                                    struct sched_unit *);
+    void         (*context_saved)  (const struct scheduler *,
+                                    struct sched_unit *);
+
+    void         (*do_schedule)    (const struct scheduler *,
+                                    struct sched_unit *, s_time_t,
+                                    bool tasklet_work_scheduled);
+
+    struct sched_resource *(*pick_resource)(const struct scheduler *,
+                                            const struct sched_unit *);
+    void         (*migrate)        (const struct scheduler *,
+                                    struct sched_unit *, unsigned int);
+    int          (*adjust)         (const struct scheduler *, struct domain *,
+                                    struct xen_domctl_scheduler_op *);
+    void         (*adjust_affinity)(const struct scheduler *,
+                                    struct sched_unit *,
+                                    const struct cpumask *,
+                                    const struct cpumask *);
+    int          (*adjust_global)  (const struct scheduler *,
+                                    struct xen_sysctl_scheduler_op *);
+    void         (*dump_settings)  (const struct scheduler *);
+    void         (*dump_cpu_state) (const struct scheduler *, int);
+};
+
+static inline int sched_init(struct scheduler *s)
+{
+    return s->init(s);
+}
+
+static inline void sched_deinit(struct scheduler *s)
+{
+    s->deinit(s);
+}
+
+static inline spinlock_t *sched_switch_sched(struct scheduler *s,
+                                             unsigned int cpu,
+                                             void *pdata, void *vdata)
+{
+    return s->switch_sched(s, cpu, pdata, vdata);
+}
+
+static inline void sched_dump_settings(const struct scheduler *s)
+{
+    if ( s->dump_settings )
+        s->dump_settings(s);
+}
+
+static inline void sched_dump_cpu_state(const struct scheduler *s, int cpu)
+{
+    if ( s->dump_cpu_state )
+        s->dump_cpu_state(s, cpu);
+}
+
+static inline void *sched_alloc_domdata(const struct scheduler *s,
+                                        struct domain *d)
+{
+    return s->alloc_domdata ? s->alloc_domdata(s, d) : NULL;
+}
+
+static inline void sched_free_domdata(const struct scheduler *s,
+                                      void *data)
+{
+    ASSERT(s->free_domdata || !data);
+    if ( s->free_domdata )
+        s->free_domdata(s, data);
+}
+
+static inline void *sched_alloc_pdata(const struct scheduler *s, int cpu)
+{
+    return s->alloc_pdata ? s->alloc_pdata(s, cpu) : NULL;
+}
+
+static inline void sched_free_pdata(const struct scheduler *s, void *data,
+                                    int cpu)
+{
+    ASSERT(s->free_pdata || !data);
+    if ( s->free_pdata )
+        s->free_pdata(s, data, cpu);
+}
+
+static inline void sched_init_pdata(const struct scheduler *s, void *data,
+                                    int cpu)
+{
+    if ( s->init_pdata )
+        s->init_pdata(s, data, cpu);
+}
+
+static inline void sched_deinit_pdata(const struct scheduler *s, void *data,
+                                      int cpu)
+{
+    if ( s->deinit_pdata )
+        s->deinit_pdata(s, data, cpu);
+}
+
+static inline void *sched_alloc_udata(const struct scheduler *s,
+                                      struct sched_unit *unit, void *dom_data)
+{
+    return s->alloc_udata(s, unit, dom_data);
+}
+
+static inline void sched_free_udata(const struct scheduler *s, void *data)
+{
+    s->free_udata(s, data);
+}
+
+static inline void sched_insert_unit(const struct scheduler *s,
+                                     struct sched_unit *unit)
+{
+    if ( s->insert_unit )
+        s->insert_unit(s, unit);
+}
+
+static inline void sched_remove_unit(const struct scheduler *s,
+                                     struct sched_unit *unit)
+{
+    if ( s->remove_unit )
+        s->remove_unit(s, unit);
+}
+
+static inline void sched_sleep(const struct scheduler *s,
+                               struct sched_unit *unit)
+{
+    if ( s->sleep )
+        s->sleep(s, unit);
+}
+
+static inline void sched_wake(const struct scheduler *s,
+                              struct sched_unit *unit)
+{
+    if ( s->wake )
+        s->wake(s, unit);
+}
+
+static inline void sched_yield(const struct scheduler *s,
+                               struct sched_unit *unit)
+{
+    if ( s->yield )
+        s->yield(s, unit);
+}
+
+static inline void sched_context_saved(const struct scheduler *s,
+                                       struct sched_unit *unit)
+{
+    if ( s->context_saved )
+        s->context_saved(s, unit);
+}
+
+static inline void sched_migrate(const struct scheduler *s,
+                                 struct sched_unit *unit, unsigned int cpu)
+{
+    if ( s->migrate )
+        s->migrate(s, unit, cpu);
+    else
+        sched_set_res(unit, get_sched_res(cpu));
+}
+
+static inline struct sched_resource *sched_pick_resource(
+    const struct scheduler *s, const struct sched_unit *unit)
+{
+    return s->pick_resource(s, unit);
+}
+
+static inline void sched_adjust_affinity(const struct scheduler *s,
+                                         struct sched_unit *unit,
+                                         const cpumask_t *hard,
+                                         const cpumask_t *soft)
+{
+    if ( s->adjust_affinity )
+        s->adjust_affinity(s, unit, hard, soft);
+}
+
+static inline int sched_adjust_dom(const struct scheduler *s, struct domain *d,
+                                   struct xen_domctl_scheduler_op *op)
+{
+    return s->adjust ? s->adjust(s, d, op) : 0;
+}
+
+static inline int sched_adjust_cpupool(const struct scheduler *s,
+                                       struct xen_sysctl_scheduler_op *op)
+{
+    return s->adjust_global ? s->adjust_global(s, op) : 0;
+}
+
+static inline void sched_unit_pause_nosync(const struct sched_unit *unit)
+{
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        vcpu_pause_nosync(v);
+}
+
+static inline void sched_unit_unpause(const struct sched_unit *unit)
+{
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        vcpu_unpause(v);
+}
+
+#define REGISTER_SCHEDULER(x) static const struct scheduler *x##_entry \
+  __used_section(".data.schedulers") = &x;
+
+struct cpupool
+{
+    int              cpupool_id;
+    unsigned int     n_dom;
+    cpumask_var_t    cpu_valid;      /* all cpus assigned to pool */
+    cpumask_var_t    res_valid;      /* all scheduling resources of pool */
+    struct cpupool   *next;
+    struct scheduler *sched;
+    atomic_t         refcnt;
+    enum sched_gran  gran;
+};
+
+static inline cpumask_t *cpupool_domain_master_cpumask(const struct domain *d)
+{
+    /*
+     * d->cpupool is NULL only for the idle domain, and no one should
+     * be interested in calling this for the idle domain.
+     */
+    ASSERT(d->cpupool != NULL);
+    return d->cpupool->res_valid;
+}
+
+unsigned int cpupool_get_granularity(const struct cpupool *c);
+
+/*
+ * Hard and soft affinity load balancing.
+ *
+ * Idea is each vcpu has some pcpus that it prefers, some that it does not
+ * prefer but is OK with, and some that it cannot run on at all. The first
+ * set of pcpus are the ones that are both in the soft affinity *and* in the
+ * hard affinity; the second set of pcpus are the ones that are in the hard
+ * affinity but *not* in the soft affinity; the third set of pcpus are the
+ * ones that are not in the hard affinity.
+ *
+ * We implement a two step balancing logic. Basically, every time there is
+ * the need to decide where to run a vcpu, we first check the soft affinity
+ * (well, actually, the && between soft and hard affinity), to see if we can
+ * send it where it prefers to (and can) run on. However, if the first step
+ * does not find any suitable and free pcpu, we fall back checking the hard
+ * affinity.
+ */
+#define BALANCE_SOFT_AFFINITY    0
+#define BALANCE_HARD_AFFINITY    1
+
+#define for_each_affinity_balance_step(step) \
+    for ( (step) = 0; (step) <= BALANCE_HARD_AFFINITY; (step)++ )
+
+/*
+ * Hard affinity balancing is always necessary and must never be skipped.
+ * But soft affinity need only be considered when it has a functionally
+ * different effect than other constraints (such as hard affinity, cpus
+ * online, or cpupools).
+ *
+ * Soft affinity only needs to be considered if:
+ * * The cpus in the cpupool are not a subset of soft affinity
+ * * The hard affinity is not a subset of soft affinity
+ * * There is an overlap between the soft and hard affinity masks
+ */
+static inline int has_soft_affinity(const struct sched_unit *unit)
+{
+    return unit->soft_aff_effective &&
+           !cpumask_subset(cpupool_domain_master_cpumask(unit->domain),
+                           unit->cpu_soft_affinity);
+}
+
+/*
+ * This function copies in mask the cpumask that should be used for a
+ * particular affinity balancing step. For the soft affinity one, the pcpus
+ * that are not part of vc's hard affinity are filtered out from the result,
+ * to avoid running a vcpu where it would like, but is not allowed to!
+ */
+static inline void
+affinity_balance_cpumask(const struct sched_unit *unit, int step,
+                         cpumask_t *mask)
+{
+    if ( step == BALANCE_SOFT_AFFINITY )
+    {
+        cpumask_and(mask, unit->cpu_soft_affinity, unit->cpu_hard_affinity);
+
+        if ( unlikely(cpumask_empty(mask)) )
+            cpumask_copy(mask, unit->cpu_hard_affinity);
+    }
+    else /* step == BALANCE_HARD_AFFINITY */
+        cpumask_copy(mask, unit->cpu_hard_affinity);
+}
+
+void sched_rm_cpu(unsigned int cpu);
+const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu);
+
+#endif /* __XEN_SCHED_IF_H__ */
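
To illustrate how a scheduler sitting on top of this header ties the pieces
together, here is a rough, hypothetical sketch (not taken from any in-tree
scheduler) of a CPU-picking helper.  It uses the generated
unit_schedule_lock_irq()/unit_schedule_unlock_irq() wrappers, the per-CPU
scratch mask (which may only be used while holding the scheduler lock), and
the two-step soft/hard affinity balancing declared above; only the function
itself and its policy are invented:

#include <xen/sched.h>

#include "private.h"

/* Hypothetical: pick an online CPU for @unit, preferring soft affinity. */
static unsigned int example_pick_cpu(const struct sched_unit *unit)
{
    unsigned int master = sched_unit_master(unit);
    const cpumask_t *online = cpupool_domain_master_cpumask(unit->domain);
    unsigned int step, pick = master;
    spinlock_t *lock;
    cpumask_t *mask;

    /* The scratch mask is only valid under the pCPU's scheduler lock. */
    lock = unit_schedule_lock_irq(unit);
    mask = cpumask_scratch_cpu(master);

    for_each_affinity_balance_step ( step )
    {
        /* The soft step is skipped when it cannot change the outcome. */
        if ( step == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) )
            continue;

        /* mask = soft & hard (or just hard), restricted to the cpupool. */
        affinity_balance_cpumask(unit, step, mask);
        cpumask_and(mask, mask, online);

        if ( !cpumask_empty(mask) )
        {
            pick = cpumask_any(mask);
            break;
        }
    }

    unit_schedule_unlock_irq(lock, unit);

    return pick;
}
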
diff --git a/xen/common/sched/rt.c b/xen/common/sched/rt.c
index c40a7e4990..a7125aef15 100644
--- a/xen/common/sched/rt.c
+++ b/xen/common/sched/rt.c
@@ -20,7 +20,6 @@
 #include <xen/time.h>
 #include <xen/timer.h>
 #include <xen/perfc.h>
-#include <xen/sched-if.h>
 #include <xen/softirq.h>
 #include <asm/atomic.h>
 #include <xen/errno.h>
@@ -31,6 +30,8 @@
 #include <xen/err.h>
 #include <xen/guest_access.h>
 
+#include "private.h"
+
 /*
  * TODO:
  *
diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
index 1cb205d977..7e51d361de 100644
--- a/xen/include/xen/domain.h
+++ b/xen/include/xen/domain.h
@@ -27,6 +27,9 @@ struct xen_domctl_getdomaininfo;
 void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info);
 void arch_get_domain_info(const struct domain *d,
                           struct xen_domctl_getdomaininfo *info);
+int xenctl_bitmap_to_bitmap(unsigned long *bitmap,
+                            const struct xenctl_bitmap *xenctl_bitmap,
+                            unsigned int nbits);
 
 /*
  * Arch-specifics.
diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h
deleted file mode 100644
index b0ac54e63d..0000000000
--- a/xen/include/xen/sched-if.h
+++ /dev/null
@@ -1,625 +0,0 @@
-/******************************************************************************
- * Additional declarations for the generic scheduler interface.  This should
- * only be included by files that implement conforming schedulers.
- *
- * Portions by Mark Williamson are (C) 2004 Intel Research Cambridge
- */
-
-#ifndef __XEN_SCHED_IF_H__
-#define __XEN_SCHED_IF_H__
-
-#include <xen/percpu.h>
-#include <xen/err.h>
-#include <xen/rcupdate.h>
-
-/* A global pointer to the initial cpupool (POOL0). */
-extern struct cpupool *cpupool0;
-
-/* cpus currently in no cpupool */
-extern cpumask_t cpupool_free_cpus;
-
-/* Scheduler generic parameters
- * */
-#define SCHED_DEFAULT_RATELIMIT_US 1000
-extern int sched_ratelimit_us;
-
-/* Scheduling resource mask. */
-extern cpumask_t sched_res_mask;
-
-/* Number of vcpus per struct sched_unit. */
-enum sched_gran {
-    SCHED_GRAN_cpu,
-    SCHED_GRAN_core,
-    SCHED_GRAN_socket
-};
-
-/*
- * In order to allow a scheduler to remap the lock->cpu mapping,
- * we have a per-cpu pointer, along with a pre-allocated set of
- * locks.  The generic schedule init code will point each schedule lock
- * pointer to the schedule lock; if the scheduler wants to remap them,
- * it can simply modify the schedule locks.
- * 
- * For cache betterness, keep the actual lock in the same cache area
- * as the rest of the struct.  Just have the scheduler point to the
- * one it wants (This may be the one right in front of it).*/
-struct sched_resource {
-    struct scheduler   *scheduler;
-    struct cpupool     *cpupool;
-    spinlock_t         *schedule_lock,
-                       _lock;
-    struct sched_unit  *curr;
-    struct sched_unit  *sched_unit_idle;
-    struct sched_unit  *prev;
-    void               *sched_priv;
-    struct timer        s_timer;        /* scheduling timer                */
-
-    /* Cpu with lowest id in scheduling resource. */
-    unsigned int        master_cpu;
-    unsigned int        granularity;
-    cpumask_var_t       cpus;           /* cpus covered by this struct     */
-    struct rcu_head     rcu;
-};
-
-DECLARE_PER_CPU(struct sched_resource *, sched_res);
-extern rcu_read_lock_t sched_res_rculock;
-
-static inline struct sched_resource *get_sched_res(unsigned int cpu)
-{
-    return rcu_dereference(per_cpu(sched_res, cpu));
-}
-
-static inline void set_sched_res(unsigned int cpu, struct sched_resource *res)
-{
-    rcu_assign_pointer(per_cpu(sched_res, cpu), res);
-}
-
-static inline struct sched_unit *curr_on_cpu(unsigned int cpu)
-{
-    return get_sched_res(cpu)->curr;
-}
-
-static inline bool is_idle_unit(const struct sched_unit *unit)
-{
-    return is_idle_vcpu(unit->vcpu_list);
-}
-
-/* Returns true if at least one vcpu of the unit is online. */
-static inline bool is_unit_online(const struct sched_unit *unit)
-{
-    const struct vcpu *v;
-
-    for_each_sched_unit_vcpu ( unit, v )
-        if ( is_vcpu_online(v) )
-            return true;
-
-    return false;
-}
-
-static inline unsigned int unit_running(const struct sched_unit *unit)
-{
-    return unit->runstate_cnt[RUNSTATE_running];
-}
-
-/* Returns true if at least one vcpu of the unit is runnable. */
-static inline bool unit_runnable(const struct sched_unit *unit)
-{
-    const struct vcpu *v;
-
-    for_each_sched_unit_vcpu ( unit, v )
-        if ( vcpu_runnable(v) )
-            return true;
-
-    return false;
-}
-
-static inline int vcpu_runstate_blocked(const struct vcpu *v)
-{
-    return (v->pause_flags & VPF_blocked) ? RUNSTATE_blocked : RUNSTATE_offline;
-}
-
-/*
- * Returns whether a sched_unit is runnable and sets new_state for each of its
- * vcpus. It is mandatory to determine the new runstate for all vcpus of a unit
- * without dropping the schedule lock (which happens when synchronizing the
- * context switch of the vcpus of a unit) in order to avoid races with e.g.
- * vcpu_sleep().
- */
-static inline bool unit_runnable_state(const struct sched_unit *unit)
-{
-    struct vcpu *v;
-    bool runnable, ret = false;
-
-    if ( is_idle_unit(unit) )
-        return true;
-
-    for_each_sched_unit_vcpu ( unit, v )
-    {
-        runnable = vcpu_runnable(v);
-
-        v->new_state = runnable ? RUNSTATE_running : vcpu_runstate_blocked(v);
-
-        if ( runnable )
-            ret = true;
-    }
-
-    return ret;
-}
-
-static inline void sched_set_res(struct sched_unit *unit,
-                                 struct sched_resource *res)
-{
-    unsigned int cpu = cpumask_first(res->cpus);
-    struct vcpu *v;
-
-    for_each_sched_unit_vcpu ( unit, v )
-    {
-        ASSERT(cpu < nr_cpu_ids);
-        v->processor = cpu;
-        cpu = cpumask_next(cpu, res->cpus);
-    }
-
-    unit->res = res;
-}
-
-/* Return master cpu of the scheduling resource the unit is assigned to. */
-static inline unsigned int sched_unit_master(const struct sched_unit *unit)
-{
-    return unit->res->master_cpu;
-}
-
-/* Set a bit in pause_flags of all vcpus of a unit. */
-static inline void sched_set_pause_flags(struct sched_unit *unit,
-                                         unsigned int bit)
-{
-    struct vcpu *v;
-
-    for_each_sched_unit_vcpu ( unit, v )
-        __set_bit(bit, &v->pause_flags);
-}
-
-/* Clear a bit in pause_flags of all vcpus of a unit. */
-static inline void sched_clear_pause_flags(struct sched_unit *unit,
-                                           unsigned int bit)
-{
-    struct vcpu *v;
-
-    for_each_sched_unit_vcpu ( unit, v )
-        __clear_bit(bit, &v->pause_flags);
-}
-
-/* Set a bit in pause_flags of all vcpus of a unit via atomic updates. */
-static inline void sched_set_pause_flags_atomic(struct sched_unit *unit,
-                                                unsigned int bit)
-{
-    struct vcpu *v;
-
-    for_each_sched_unit_vcpu ( unit, v )
-        set_bit(bit, &v->pause_flags);
-}
-
-/* Clear a bit in pause_flags of all vcpus of a unit via atomic updates. */
-static inline void sched_clear_pause_flags_atomic(struct sched_unit *unit,
-                                                  unsigned int bit)
-{
-    struct vcpu *v;
-
-    for_each_sched_unit_vcpu ( unit, v )
-        clear_bit(bit, &v->pause_flags);
-}
-
-static inline struct sched_unit *sched_idle_unit(unsigned int cpu)
-{
-    return get_sched_res(cpu)->sched_unit_idle;
-}
-
-static inline unsigned int sched_get_resource_cpu(unsigned int cpu)
-{
-    return get_sched_res(cpu)->master_cpu;
-}
-
-/*
- * Scratch space, for avoiding having too many cpumask_t on the stack.
- * Within each scheduler, when using the scratch mask of one pCPU:
- * - the pCPU must belong to the scheduler,
- * - the caller must own the per-pCPU scheduler lock (a.k.a. runqueue
- *   lock).
- */
-DECLARE_PER_CPU(cpumask_t, cpumask_scratch);
-#define cpumask_scratch        (&this_cpu(cpumask_scratch))
-#define cpumask_scratch_cpu(c) (&per_cpu(cpumask_scratch, c))
-
-#define sched_lock(kind, param, cpu, irq, arg...) \
-static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \
-{ \
-    for ( ; ; ) \
-    { \
-        spinlock_t *lock = get_sched_res(cpu)->schedule_lock; \
-        /* \
-         * v->processor may change when grabbing the lock; but \
-         * per_cpu(v->processor) may also change, if changing cpu pool \
-         * also changes the scheduler lock.  Retry until they match. \
-         * \
-         * It may also be the case that v->processor may change but the \
-         * lock may be the same; this will succeed in that case. \
-         */ \
-        spin_lock##irq(lock, ## arg); \
-        if ( likely(lock == get_sched_res(cpu)->schedule_lock) ) \
-            return lock; \
-        spin_unlock##irq(lock, ## arg); \
-    } \
-}
-
-#define sched_unlock(kind, param, cpu, irq, arg...) \
-static inline void kind##_schedule_unlock##irq(spinlock_t *lock \
-                                               EXTRA_TYPE(arg), param) \
-{ \
-    ASSERT(lock == get_sched_res(cpu)->schedule_lock); \
-    spin_unlock##irq(lock, ## arg); \
-}
-
-#define EXTRA_TYPE(arg)
-sched_lock(pcpu, unsigned int cpu,     cpu, )
-sched_lock(unit, const struct sched_unit *i, i->res->master_cpu, )
-sched_lock(pcpu, unsigned int cpu,     cpu,          _irq)
-sched_lock(unit, const struct sched_unit *i, i->res->master_cpu, _irq)
-sched_unlock(pcpu, unsigned int cpu,     cpu, )
-sched_unlock(unit, const struct sched_unit *i, i->res->master_cpu, )
-sched_unlock(pcpu, unsigned int cpu,     cpu,          _irq)
-sched_unlock(unit, const struct sched_unit *i, i->res->master_cpu, _irq)
-#undef EXTRA_TYPE
-
-#define EXTRA_TYPE(arg) , unsigned long arg
-#define spin_unlock_irqsave spin_unlock_irqrestore
-sched_lock(pcpu, unsigned int cpu,     cpu,          _irqsave, *flags)
-sched_lock(unit, const struct sched_unit *i, i->res->master_cpu, _irqsave, *flags)
-#undef spin_unlock_irqsave
-sched_unlock(pcpu, unsigned int cpu,     cpu,          _irqrestore, flags)
-sched_unlock(unit, const struct sched_unit *i, i->res->master_cpu, _irqrestore, flags)
-#undef EXTRA_TYPE
-
-#undef sched_unlock
-#undef sched_lock
-
-static inline spinlock_t *pcpu_schedule_trylock(unsigned int cpu)
-{
-    spinlock_t *lock = get_sched_res(cpu)->schedule_lock;
-
-    if ( !spin_trylock(lock) )
-        return NULL;
-    if ( lock == get_sched_res(cpu)->schedule_lock )
-        return lock;
-    spin_unlock(lock);
-    return NULL;
-}
-
-struct scheduler {
-    char *name;             /* full name for this scheduler      */
-    char *opt_name;         /* option name for this scheduler    */
-    unsigned int sched_id;  /* ID for this scheduler             */
-    void *sched_data;       /* global data pointer               */
-
-    int          (*global_init)    (void);
-
-    int          (*init)           (struct scheduler *);
-    void         (*deinit)         (struct scheduler *);
-
-    void         (*free_udata)     (const struct scheduler *, void *);
-    void *       (*alloc_udata)    (const struct scheduler *,
-                                    struct sched_unit *, void *);
-    void         (*free_pdata)     (const struct scheduler *, void *, int);
-    void *       (*alloc_pdata)    (const struct scheduler *, int);
-    void         (*init_pdata)     (const struct scheduler *, void *, int);
-    void         (*deinit_pdata)   (const struct scheduler *, void *, int);
-
-    /* Returns ERR_PTR(-err) for error, NULL for 'nothing needed'. */
-    void *       (*alloc_domdata)  (const struct scheduler *, struct domain *);
-    /* Idempotent. */
-    void         (*free_domdata)   (const struct scheduler *, void *);
-
-    spinlock_t * (*switch_sched)   (struct scheduler *, unsigned int,
-                                    void *, void *);
-
-    /* Activate / deactivate units in a cpu pool */
-    void         (*insert_unit)    (const struct scheduler *,
-                                    struct sched_unit *);
-    void         (*remove_unit)    (const struct scheduler *,
-                                    struct sched_unit *);
-
-    void         (*sleep)          (const struct scheduler *,
-                                    struct sched_unit *);
-    void         (*wake)           (const struct scheduler *,
-                                    struct sched_unit *);
-    void         (*yield)          (const struct scheduler *,
-                                    struct sched_unit *);
-    void         (*context_saved)  (const struct scheduler *,
-                                    struct sched_unit *);
-
-    void         (*do_schedule)    (const struct scheduler *,
-                                    struct sched_unit *, s_time_t,
-                                    bool tasklet_work_scheduled);
-
-    struct sched_resource *(*pick_resource)(const struct scheduler *,
-                                            const struct sched_unit *);
-    void         (*migrate)        (const struct scheduler *,
-                                    struct sched_unit *, unsigned int);
-    int          (*adjust)         (const struct scheduler *, struct domain *,
-                                    struct xen_domctl_scheduler_op *);
-    void         (*adjust_affinity)(const struct scheduler *,
-                                    struct sched_unit *,
-                                    const struct cpumask *,
-                                    const struct cpumask *);
-    int          (*adjust_global)  (const struct scheduler *,
-                                    struct xen_sysctl_scheduler_op *);
-    void         (*dump_settings)  (const struct scheduler *);
-    void         (*dump_cpu_state) (const struct scheduler *, int);
-};
-
-static inline int sched_init(struct scheduler *s)
-{
-    return s->init(s);
-}
-
-static inline void sched_deinit(struct scheduler *s)
-{
-    s->deinit(s);
-}
-
-static inline spinlock_t *sched_switch_sched(struct scheduler *s,
-                                             unsigned int cpu,
-                                             void *pdata, void *vdata)
-{
-    return s->switch_sched(s, cpu, pdata, vdata);
-}
-
-static inline void sched_dump_settings(const struct scheduler *s)
-{
-    if ( s->dump_settings )
-        s->dump_settings(s);
-}
-
-static inline void sched_dump_cpu_state(const struct scheduler *s, int cpu)
-{
-    if ( s->dump_cpu_state )
-        s->dump_cpu_state(s, cpu);
-}
-
-static inline void *sched_alloc_domdata(const struct scheduler *s,
-                                        struct domain *d)
-{
-    return s->alloc_domdata ? s->alloc_domdata(s, d) : NULL;
-}
-
-static inline void sched_free_domdata(const struct scheduler *s,
-                                      void *data)
-{
-    ASSERT(s->free_domdata || !data);
-    if ( s->free_domdata )
-        s->free_domdata(s, data);
-}
-
-static inline void *sched_alloc_pdata(const struct scheduler *s, int cpu)
-{
-    return s->alloc_pdata ? s->alloc_pdata(s, cpu) : NULL;
-}
-
-static inline void sched_free_pdata(const struct scheduler *s, void *data,
-                                    int cpu)
-{
-    ASSERT(s->free_pdata || !data);
-    if ( s->free_pdata )
-        s->free_pdata(s, data, cpu);
-}
-
-static inline void sched_init_pdata(const struct scheduler *s, void *data,
-                                    int cpu)
-{
-    if ( s->init_pdata )
-        s->init_pdata(s, data, cpu);
-}
-
-static inline void sched_deinit_pdata(const struct scheduler *s, void *data,
-                                      int cpu)
-{
-    if ( s->deinit_pdata )
-        s->deinit_pdata(s, data, cpu);
-}
-
-static inline void *sched_alloc_udata(const struct scheduler *s,
-                                      struct sched_unit *unit, void *dom_data)
-{
-    return s->alloc_udata(s, unit, dom_data);
-}
-
-static inline void sched_free_udata(const struct scheduler *s, void *data)
-{
-    s->free_udata(s, data);
-}
-
-static inline void sched_insert_unit(const struct scheduler *s,
-                                     struct sched_unit *unit)
-{
-    if ( s->insert_unit )
-        s->insert_unit(s, unit);
-}
-
-static inline void sched_remove_unit(const struct scheduler *s,
-                                     struct sched_unit *unit)
-{
-    if ( s->remove_unit )
-        s->remove_unit(s, unit);
-}
-
-static inline void sched_sleep(const struct scheduler *s,
-                               struct sched_unit *unit)
-{
-    if ( s->sleep )
-        s->sleep(s, unit);
-}
-
-static inline void sched_wake(const struct scheduler *s,
-                              struct sched_unit *unit)
-{
-    if ( s->wake )
-        s->wake(s, unit);
-}
-
-static inline void sched_yield(const struct scheduler *s,
-                               struct sched_unit *unit)
-{
-    if ( s->yield )
-        s->yield(s, unit);
-}
-
-static inline void sched_context_saved(const struct scheduler *s,
-                                       struct sched_unit *unit)
-{
-    if ( s->context_saved )
-        s->context_saved(s, unit);
-}
-
-static inline void sched_migrate(const struct scheduler *s,
-                                 struct sched_unit *unit, unsigned int cpu)
-{
-    if ( s->migrate )
-        s->migrate(s, unit, cpu);
-    else
-        sched_set_res(unit, get_sched_res(cpu));
-}
-
-static inline struct sched_resource *sched_pick_resource(
-    const struct scheduler *s, const struct sched_unit *unit)
-{
-    return s->pick_resource(s, unit);
-}
-
-static inline void sched_adjust_affinity(const struct scheduler *s,
-                                         struct sched_unit *unit,
-                                         const cpumask_t *hard,
-                                         const cpumask_t *soft)
-{
-    if ( s->adjust_affinity )
-        s->adjust_affinity(s, unit, hard, soft);
-}
-
-static inline int sched_adjust_dom(const struct scheduler *s, struct domain *d,
-                                   struct xen_domctl_scheduler_op *op)
-{
-    return s->adjust ? s->adjust(s, d, op) : 0;
-}
-
-static inline int sched_adjust_cpupool(const struct scheduler *s,
-                                       struct xen_sysctl_scheduler_op *op)
-{
-    return s->adjust_global ? s->adjust_global(s, op) : 0;
-}
-
-static inline void sched_unit_pause_nosync(const struct sched_unit *unit)
-{
-    struct vcpu *v;
-
-    for_each_sched_unit_vcpu ( unit, v )
-        vcpu_pause_nosync(v);
-}
-
-static inline void sched_unit_unpause(const struct sched_unit *unit)
-{
-    struct vcpu *v;
-
-    for_each_sched_unit_vcpu ( unit, v )
-        vcpu_unpause(v);
-}
-
-#define REGISTER_SCHEDULER(x) static const struct scheduler *x##_entry \
-  __used_section(".data.schedulers") = &x;
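/*
 * [Illustration only, not part of the patch] Each scheduler fills in a
 * subset of the hooks above and registers itself via REGISTER_SCHEDULER(),
 * which places a pointer to the instance in the .data.schedulers section
 * walked at boot.  A hedged, minimal sketch -- sched_example_def and
 * example_schedule are hypothetical names, and a real scheduler must also
 * provide at least the unit/pcpu data allocation hooks:
 */
static void example_schedule(const struct scheduler *ops,
                             struct sched_unit *unit, s_time_t now,
                             bool tasklet_work_scheduled)
{
    /* Decide what this scheduling resource should run next. */
}

static const struct scheduler sched_example_def = {
    .name        = "Example Scheduler",   /* hypothetical */
    .opt_name    = "example",             /* would be picked via sched=example */
    .do_schedule = example_schedule,
};
REGISTER_SCHEDULER(sched_example_def);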
-
-struct cpupool
-{
-    int              cpupool_id;
-    unsigned int     n_dom;
-    cpumask_var_t    cpu_valid;      /* all cpus assigned to pool */
-    cpumask_var_t    res_valid;      /* all scheduling resources of pool */
-    struct cpupool   *next;
-    struct scheduler *sched;
-    atomic_t         refcnt;
-    enum sched_gran  gran;
-};
-
-static inline cpumask_t *cpupool_domain_master_cpumask(const struct domain *d)
-{
-    /*
-     * d->cpupool is NULL only for the idle domain, and no one should
-     * be interested in calling this for the idle domain.
-     */
-    ASSERT(d->cpupool != NULL);
-    return d->cpupool->res_valid;
-}
-
-unsigned int cpupool_get_granularity(const struct cpupool *c);
-
-/*
- * Hard and soft affinity load balancing.
- *
- * Idea is each vcpu has some pcpus that it prefers, some that it does not
- * prefer but is OK with, and some that it cannot run on at all. The first
- * set of pcpus are the ones that are both in the soft affinity *and* in the
- * hard affinity; the second set of pcpus are the ones that are in the hard
- * affinity but *not* in the soft affinity; the third set of pcpus are the
- * ones that are not in the hard affinity.
- *
- * We implement a two step balancing logic. Basically, every time there is
- * the need to decide where to run a vcpu, we first check the soft affinity
- * (well, actually, the && between soft and hard affinity), to see if we can
- * send it where it prefers to (and can) run on. However, if the first step
- * does not find any suitable and free pcpu, we fall back checking the hard
- * affinity.
- */
-#define BALANCE_SOFT_AFFINITY    0
-#define BALANCE_HARD_AFFINITY    1
-
-#define for_each_affinity_balance_step(step) \
-    for ( (step) = 0; (step) <= BALANCE_HARD_AFFINITY; (step)++ )
-
-/*
- * Hard affinity balancing is always necessary and must never be skipped.
- * But soft affinity need only be considered when it has a functionally
- * different effect than other constraints (such as hard affinity, cpus
- * online, or cpupools).
- *
- * Soft affinity only needs to be considered if:
- * * The cpus in the cpupool are not a subset of soft affinity
- * * The hard affinity is not a subset of soft affinity
- * * There is an overlap between the soft and hard affinity masks
- */
-static inline int has_soft_affinity(const struct sched_unit *unit)
-{
-    return unit->soft_aff_effective &&
-           !cpumask_subset(cpupool_domain_master_cpumask(unit->domain),
-                           unit->cpu_soft_affinity);
-}
-
-/*
- * This function copies in mask the cpumask that should be used for a
- * particular affinity balancing step. For the soft affinity one, the pcpus
- * that are not part of vc's hard affinity are filtered out from the result,
- * to avoid running a vcpu where it would like, but is not allowed to!
- */
-static inline void
-affinity_balance_cpumask(const struct sched_unit *unit, int step,
-                         cpumask_t *mask)
-{
-    if ( step == BALANCE_SOFT_AFFINITY )
-    {
-        cpumask_and(mask, unit->cpu_soft_affinity, unit->cpu_hard_affinity);
-
-        if ( unlikely(cpumask_empty(mask)) )
-            cpumask_copy(mask, unit->cpu_hard_affinity);
-    }
-    else /* step == BALANCE_HARD_AFFINITY */
-        cpumask_copy(mask, unit->cpu_hard_affinity);
-}
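/*
 * [Illustration only, not part of the patch] The typical consumer of the
 * two balancing steps described above loops over them, skips the soft step
 * when it cannot change the outcome, and falls back to hard affinity.  A
 * hedged sketch (example_pick_cpu() is a hypothetical helper; 'scratch' is
 * a caller-provided scratch cpumask):
 */
static unsigned int example_pick_cpu(const struct sched_unit *unit,
                                     cpumask_t *scratch)
{
    const cpumask_t *online = cpupool_domain_master_cpumask(unit->domain);
    unsigned int step, cpu;

    for_each_affinity_balance_step ( step )
    {
        if ( step == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) )
            continue;

        affinity_balance_cpumask(unit, step, scratch);
        cpumask_and(scratch, scratch, online);

        cpu = cpumask_first(scratch);
        if ( cpu < nr_cpu_ids )
            return cpu;                   /* candidate found in this step */
    }

    return unit->res->master_cpu;         /* fall back to current resource */
}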
-
-void sched_rm_cpu(unsigned int cpu);
-const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu);
-
-#endif /* __XEN_SCHED_IF_H__ */
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index cc942a3621..2d8ff366bc 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -50,6 +50,9 @@ DEFINE_XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t);
 /* A global pointer to the hardware domain (usually DOM0). */
 extern struct domain *hardware_domain;
 
+/* A global pointer to the initial cpupool (POOL0). */
+extern struct cpupool *cpupool0;
+
 #ifdef CONFIG_LATE_HWDOM
 extern domid_t hardware_domid;
 #else
@@ -931,6 +934,8 @@ int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason);
 int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity);
 int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity);
 void restore_vcpu_affinity(struct domain *d);
+int vcpu_affinity_domctl(struct domain *d, uint32_t cmd,
+                         struct xen_domctl_vcpuaffinity *vcpuaff);
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
 uint64_t get_cpu_idle_time(unsigned int cpu);
@@ -1068,6 +1073,8 @@ int cpupool_add_domain(struct domain *d, int poolid);
 void cpupool_rm_domain(struct domain *d);
 int cpupool_move_domain(struct domain *d, struct cpupool *c);
 int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op);
+int cpupool_get_id(const struct domain *d);
+const cpumask_t *cpupool_valid_cpus(const struct cpupool *pool);
 void schedule_dump(struct cpupool *c);
 extern void dump_runq(unsigned char key);
 
--
generated by git-patchbot for /home/xen/git/xen.git#staging

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/xen-changelog
