[Xen-changelog] [xen master] xen/sched: move schedulers and cpupool coding to dedicated directory
commit 6cb4b01c033b7abc3e7175501330dfb01fb09da5
Author: Juergen Gross <jgross@xxxxxxxx>
AuthorDate: Wed Jan 22 15:06:43 2020 +0100
Commit: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
CommitDate: Wed Jan 22 17:37:11 2020 +0000
xen/sched: move schedulers and cpupool coding to dedicated directory
Move sched*.c and cpupool.c to a new directory common/sched.
Signed-off-by: Juergen Gross <jgross@xxxxxxxx>
Reviewed-by: Dario Faggioli <dfaggioli@xxxxxxxx>
---
MAINTAINERS | 8 +-
xen/common/Kconfig | 66 +-
xen/common/Makefile | 8 +-
xen/common/compat/schedule.c | 55 -
xen/common/cpupool.c | 979 ----------
xen/common/sched/Kconfig | 65 +
xen/common/sched/Makefile | 7 +
xen/common/sched/arinc653.c | 739 ++++++++
xen/common/sched/compat.c | 55 +
xen/common/sched/core.c | 3144 ++++++++++++++++++++++++++++++++
xen/common/sched/cpupool.c | 979 ++++++++++
xen/common/sched/credit.c | 2284 +++++++++++++++++++++++
xen/common/sched/credit2.c | 4122 ++++++++++++++++++++++++++++++++++++++++++
xen/common/sched/null.c | 1034 +++++++++++
xen/common/sched/rt.c | 1571 ++++++++++++++++
xen/common/sched_arinc653.c | 739 --------
xen/common/sched_credit.c | 2284 -----------------------
xen/common/sched_credit2.c | 4122 ------------------------------------------
xen/common/sched_null.c | 1034 -----------
xen/common/sched_rt.c | 1571 ----------------
xen/common/schedule.c | 3144 --------------------------------
21 files changed, 14006 insertions(+), 14004 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index a91080cde5..dadcfb63d8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -174,7 +174,7 @@ M: Josh Whitehead <josh.whitehead@xxxxxxxxxxxxxxx>
M: Stewart Hildebrand <stewart.hildebrand@xxxxxxxxxxxxxxx>
S: Supported
L: xen-devel@xxxxxxxxxxxxxxx
-F: xen/common/sched_arinc653.c
+F: xen/common/sched/arinc653.c
F: tools/libxc/xc_arinc653.c
ARM (W/ VIRTUALISATION EXTENSIONS) ARCHITECTURE
@@ -218,7 +218,7 @@ CPU POOLS
M: Juergen Gross <jgross@xxxxxxxx>
M: Dario Faggioli <dfaggioli@xxxxxxxx>
S: Supported
-F: xen/common/cpupool.c
+F: xen/common/sched/cpupool.c
DEVICE TREE
M: Stefano Stabellini <sstabellini@xxxxxxxxxx>
@@ -384,13 +384,13 @@ RTDS SCHEDULER
M: Dario Faggioli <dfaggioli@xxxxxxxx>
M: Meng Xu <mengxu@xxxxxxxxxxxxx>
S: Supported
-F: xen/common/sched_rt.c
+F: xen/common/sched/rt.c
SCHEDULING
M: George Dunlap <george.dunlap@xxxxxxxxxxxxx>
M: Dario Faggioli <dfaggioli@xxxxxxxx>
S: Supported
-F: xen/common/sched*
+F: xen/common/sched/
SEABIOS UPSTREAM
M: Wei Liu <wl@xxxxxxx>
diff --git a/xen/common/Kconfig b/xen/common/Kconfig
index b3d161d057..9d6d09eb37 100644
--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -275,71 +275,7 @@ config ARGO
If unsure, say N.
-menu "Schedulers"
- visible if EXPERT = "y"
-
-config SCHED_CREDIT
- bool "Credit scheduler support"
- default y
- ---help---
- The traditional credit scheduler is a general purpose scheduler.
-
-config SCHED_CREDIT2
- bool "Credit2 scheduler support"
- default y
- ---help---
- The credit2 scheduler is a general purpose scheduler that is
- optimized for lower latency and higher VM density.
-
-config SCHED_RTDS
- bool "RTDS scheduler support (EXPERIMENTAL)"
- default y
- ---help---
- The RTDS scheduler is a soft and firm real-time scheduler for
- multicore, targeted for embedded, automotive, graphics and gaming
- in the cloud, and general low-latency workloads.
-
-config SCHED_ARINC653
- bool "ARINC653 scheduler support (EXPERIMENTAL)"
- default DEBUG
- ---help---
- The ARINC653 scheduler is a hard real-time scheduler for single
- cores, targeted for avionics, drones, and medical devices.
-
-config SCHED_NULL
- bool "Null scheduler support (EXPERIMENTAL)"
- default y
- ---help---
- The null scheduler is a static, zero overhead scheduler,
- for when there are always fewer vCPUs than pCPUs, typically
- in embedded or HPC scenarios.
-
-choice
- prompt "Default Scheduler?"
- default SCHED_CREDIT2_DEFAULT
-
- config SCHED_CREDIT_DEFAULT
- bool "Credit Scheduler" if SCHED_CREDIT
- config SCHED_CREDIT2_DEFAULT
- bool "Credit2 Scheduler" if SCHED_CREDIT2
- config SCHED_RTDS_DEFAULT
- bool "RT Scheduler" if SCHED_RTDS
- config SCHED_ARINC653_DEFAULT
- bool "ARINC653 Scheduler" if SCHED_ARINC653
- config SCHED_NULL_DEFAULT
- bool "Null Scheduler" if SCHED_NULL
-endchoice
-
-config SCHED_DEFAULT
- string
- default "credit" if SCHED_CREDIT_DEFAULT
- default "credit2" if SCHED_CREDIT2_DEFAULT
- default "rtds" if SCHED_RTDS_DEFAULT
- default "arinc653" if SCHED_ARINC653_DEFAULT
- default "null" if SCHED_NULL_DEFAULT
- default "credit2"
-
-endmenu
+source "common/sched/Kconfig"
config CRYPTO
bool
diff --git a/xen/common/Makefile b/xen/common/Makefile
index 62b34e69e9..2abb8250b0 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -3,7 +3,6 @@ obj-y += bitmap.o
obj-y += bsearch.o
obj-$(CONFIG_CORE_PARKING) += core_parking.o
obj-y += cpu.o
-obj-y += cpupool.o
obj-$(CONFIG_DEBUG_TRACE) += debugtrace.o
obj-$(CONFIG_HAS_DEVICE_TREE) += device_tree.o
obj-y += domctl.o
@@ -38,12 +37,6 @@ obj-y += radix-tree.o
obj-y += rbtree.o
obj-y += rcupdate.o
obj-y += rwlock.o
-obj-$(CONFIG_SCHED_ARINC653) += sched_arinc653.o
-obj-$(CONFIG_SCHED_CREDIT) += sched_credit.o
-obj-$(CONFIG_SCHED_CREDIT2) += sched_credit2.o
-obj-$(CONFIG_SCHED_RTDS) += sched_rt.o
-obj-$(CONFIG_SCHED_NULL) += sched_null.o
-obj-y += schedule.o
obj-y += shutdown.o
obj-y += softirq.o
obj-y += sort.o
@@ -74,6 +67,7 @@ obj-$(CONFIG_COMPAT) += $(addprefix compat/,domain.o kernel.o memory.o multicall
extra-y := symbols-dummy.o
subdir-$(CONFIG_COVERAGE) += coverage
+subdir-y += sched
subdir-$(CONFIG_UBSAN) += ubsan
subdir-$(CONFIG_NEEDS_LIBELF) += libelf
diff --git a/xen/common/compat/schedule.c b/xen/common/compat/schedule.c
deleted file mode 100644
index 8b6e6f107d..0000000000
--- a/xen/common/compat/schedule.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/****************************************************************************
- * schedule.c
- *
- */
-
-#include <compat/sched.h>
-
-#define COMPAT
-#define ret_t int
-
-#define do_sched_op compat_sched_op
-
-#define xen_sched_pin_override sched_pin_override
-CHECK_sched_pin_override;
-#undef xen_sched_pin_override
-
-#define xen_sched_shutdown sched_shutdown
-CHECK_sched_shutdown;
-#undef xen_sched_shutdown
-
-#define xen_sched_remote_shutdown sched_remote_shutdown
-CHECK_sched_remote_shutdown;
-#undef xen_sched_remote_shutdown
-
-static int compat_poll(struct compat_sched_poll *compat)
-{
- struct sched_poll native;
-
-#define XLAT_sched_poll_HNDL_ports(_d_, _s_) \
- guest_from_compat_handle((_d_)->ports, (_s_)->ports)
- XLAT_sched_poll(&native, compat);
-#undef XLAT_sched_poll_HNDL_ports
-
- return do_poll(&native);
-}
-
-#define do_poll compat_poll
-#define sched_poll compat_sched_poll
-
-#include "../schedule.c"
-
-int compat_set_timer_op(u32 lo, s32 hi)
-{
- return do_set_timer_op(((s64)hi << 32) | lo);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c
deleted file mode 100644
index d66b541a94..0000000000
--- a/xen/common/cpupool.c
+++ /dev/null
@@ -1,979 +0,0 @@
-/******************************************************************************
- * cpupool.c
- *
- * Generic cpupool-handling functions.
- *
- * Cpupools are a feature to have configurable scheduling domains. Each
- * cpupool runs an own scheduler on a dedicated set of physical cpus.
- * A domain is bound to one cpupool at any time, but it can be moved to
- * another cpupool.
- *
- * (C) 2009, Juergen Gross, Fujitsu Technology Solutions
- */
-
-#include <xen/lib.h>
-#include <xen/init.h>
-#include <xen/cpumask.h>
-#include <xen/percpu.h>
-#include <xen/sched.h>
-#include <xen/sched-if.h>
-#include <xen/warning.h>
-#include <xen/keyhandler.h>
-#include <xen/cpu.h>
-
-#define for_each_cpupool(ptr) \
- for ((ptr) = &cpupool_list; *(ptr) != NULL; (ptr) = &((*(ptr))->next))
-
-struct cpupool *cpupool0; /* Initial cpupool with Dom0 */
-cpumask_t cpupool_free_cpus; /* cpus not in any cpupool */
-
-static struct cpupool *cpupool_list; /* linked list, sorted by poolid */
-
-static int cpupool_moving_cpu = -1;
-static struct cpupool *cpupool_cpu_moving = NULL;
-static cpumask_t cpupool_locked_cpus;
-
-static DEFINE_SPINLOCK(cpupool_lock);
-
-static enum sched_gran __read_mostly opt_sched_granularity = SCHED_GRAN_cpu;
-static unsigned int __read_mostly sched_granularity = 1;
-
-#ifdef CONFIG_HAS_SCHED_GRANULARITY
-static int __init sched_select_granularity(const char *str)
-{
- if ( strcmp("cpu", str) == 0 )
- opt_sched_granularity = SCHED_GRAN_cpu;
- else if ( strcmp("core", str) == 0 )
- opt_sched_granularity = SCHED_GRAN_core;
- else if ( strcmp("socket", str) == 0 )
- opt_sched_granularity = SCHED_GRAN_socket;
- else
- return -EINVAL;
-
- return 0;
-}
-custom_param("sched-gran", sched_select_granularity);
-#endif
-
-static unsigned int __init cpupool_check_granularity(void)
-{
- unsigned int cpu;
- unsigned int siblings, gran = 0;
-
- if ( opt_sched_granularity == SCHED_GRAN_cpu )
- return 1;
-
- for_each_online_cpu ( cpu )
- {
- siblings = cpumask_weight(sched_get_opt_cpumask(opt_sched_granularity,
- cpu));
- if ( gran == 0 )
- gran = siblings;
- else if ( gran != siblings )
- return 0;
- }
-
- sched_disable_smt_switching = true;
-
- return gran;
-}
-
-/* Setup data for selected scheduler granularity. */
-static void __init cpupool_gran_init(void)
-{
- unsigned int gran = 0;
- const char *fallback = NULL;
-
- while ( gran == 0 )
- {
- gran = cpupool_check_granularity();
-
- if ( gran == 0 )
- {
- switch ( opt_sched_granularity )
- {
- case SCHED_GRAN_core:
- opt_sched_granularity = SCHED_GRAN_cpu;
- fallback = "Asymmetric cpu configuration.\n"
- "Falling back to sched-gran=cpu.\n";
- break;
- case SCHED_GRAN_socket:
- opt_sched_granularity = SCHED_GRAN_core;
- fallback = "Asymmetric cpu configuration.\n"
- "Falling back to sched-gran=core.\n";
- break;
- default:
- ASSERT_UNREACHABLE();
- break;
- }
- }
- }
-
- if ( fallback )
- warning_add(fallback);
-
- sched_granularity = gran;
-}
-
-unsigned int cpupool_get_granularity(const struct cpupool *c)
-{
- return c ? sched_granularity : 1;
-}
-
-static void free_cpupool_struct(struct cpupool *c)
-{
- if ( c )
- {
- free_cpumask_var(c->res_valid);
- free_cpumask_var(c->cpu_valid);
- }
- xfree(c);
-}
-
-static struct cpupool *alloc_cpupool_struct(void)
-{
- struct cpupool *c = xzalloc(struct cpupool);
-
- if ( !c )
- return NULL;
-
- if ( !zalloc_cpumask_var(&c->cpu_valid) ||
- !zalloc_cpumask_var(&c->res_valid) )
- {
- free_cpupool_struct(c);
- c = NULL;
- }
-
- return c;
-}
-
-/*
- * find a cpupool by its id. to be called with cpupool lock held
- * if exact is not specified, the first cpupool with an id larger or equal to
- * the searched id is returned
- * returns NULL if not found.
- */
-static struct cpupool *__cpupool_find_by_id(int id, int exact)
-{
- struct cpupool **q;
-
- ASSERT(spin_is_locked(&cpupool_lock));
-
- for_each_cpupool(q)
- if ( (*q)->cpupool_id >= id )
- break;
-
- return (!exact || (*q == NULL) || ((*q)->cpupool_id == id)) ? *q : NULL;
-}
-
-static struct cpupool *cpupool_find_by_id(int poolid)
-{
- return __cpupool_find_by_id(poolid, 1);
-}
-
-static struct cpupool *__cpupool_get_by_id(int poolid, int exact)
-{
- struct cpupool *c;
- spin_lock(&cpupool_lock);
- c = __cpupool_find_by_id(poolid, exact);
- if ( c != NULL )
- atomic_inc(&c->refcnt);
- spin_unlock(&cpupool_lock);
- return c;
-}
-
-struct cpupool *cpupool_get_by_id(int poolid)
-{
- return __cpupool_get_by_id(poolid, 1);
-}
-
-static struct cpupool *cpupool_get_next_by_id(int poolid)
-{
- return __cpupool_get_by_id(poolid, 0);
-}
-
-void cpupool_put(struct cpupool *pool)
-{
- if ( !atomic_dec_and_test(&pool->refcnt) )
- return;
- scheduler_free(pool->sched);
- free_cpupool_struct(pool);
-}
-
-/*
- * create a new cpupool with specified poolid and scheduler
- * returns pointer to new cpupool structure if okay, NULL else
- * possible failures:
- * - no memory
- * - poolid already used
- * - unknown scheduler
- */
-static struct cpupool *cpupool_create(
- int poolid, unsigned int sched_id, int *perr)
-{
- struct cpupool *c;
- struct cpupool **q;
- int last = 0;
-
- *perr = -ENOMEM;
- if ( (c = alloc_cpupool_struct()) == NULL )
- return NULL;
-
- /* One reference for caller, one reference for cpupool_destroy(). */
- atomic_set(&c->refcnt, 2);
-
- debugtrace_printk("cpupool_create(pool=%d,sched=%u)\n", poolid, sched_id);
-
- spin_lock(&cpupool_lock);
-
- for_each_cpupool(q)
- {
- last = (*q)->cpupool_id;
- if ( (poolid != CPUPOOLID_NONE) && (last >= poolid) )
- break;
- }
- if ( *q != NULL )
- {
- if ( (*q)->cpupool_id == poolid )
- {
- *perr = -EEXIST;
- goto err;
- }
- c->next = *q;
- }
-
- c->cpupool_id = (poolid == CPUPOOLID_NONE) ? (last + 1) : poolid;
- if ( poolid == 0 )
- {
- c->sched = scheduler_get_default();
- }
- else
- {
- c->sched = scheduler_alloc(sched_id, perr);
- if ( c->sched == NULL )
- goto err;
- }
- c->gran = opt_sched_granularity;
-
- *q = c;
-
- spin_unlock(&cpupool_lock);
-
- debugtrace_printk("Created cpupool %d with scheduler %s (%s)\n",
- c->cpupool_id, c->sched->name, c->sched->opt_name);
-
- *perr = 0;
- return c;
-
- err:
- spin_unlock(&cpupool_lock);
- free_cpupool_struct(c);
- return NULL;
-}
-/*
- * destroys the given cpupool
- * returns 0 on success, a negative error code else
- * possible failures:
- * - pool still in use
- * - cpus still assigned to pool
- * - pool not in list
- */
-static int cpupool_destroy(struct cpupool *c)
-{
- struct cpupool **q;
-
- spin_lock(&cpupool_lock);
- for_each_cpupool(q)
- if ( *q == c )
- break;
- if ( *q != c )
- {
- spin_unlock(&cpupool_lock);
- return -ENOENT;
- }
- if ( (c->n_dom != 0) || cpumask_weight(c->cpu_valid) )
- {
- spin_unlock(&cpupool_lock);
- return -EBUSY;
- }
- *q = c->next;
- spin_unlock(&cpupool_lock);
-
- cpupool_put(c);
-
- debugtrace_printk("cpupool_destroy(pool=%d)\n", c->cpupool_id);
- return 0;
-}
-
-/*
- * Move domain to another cpupool
- */
-static int cpupool_move_domain_locked(struct domain *d, struct cpupool *c)
-{
- int ret;
-
- if ( unlikely(d->cpupool == c) )
- return 0;
-
- d->cpupool->n_dom--;
- ret = sched_move_domain(d, c);
- if ( ret )
- d->cpupool->n_dom++;
- else
- c->n_dom++;
-
- return ret;
-}
-int cpupool_move_domain(struct domain *d, struct cpupool *c)
-{
- int ret;
-
- spin_lock(&cpupool_lock);
-
- ret = cpupool_move_domain_locked(d, c);
-
- spin_unlock(&cpupool_lock);
-
- return ret;
-}
-
-/*
- * assign a specific cpu to a cpupool
- * cpupool_lock must be held
- */
-static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu)
-{
- int ret;
- struct domain *d;
- const cpumask_t *cpus;
-
- cpus = sched_get_opt_cpumask(c->gran, cpu);
-
- if ( (cpupool_moving_cpu == cpu) && (c != cpupool_cpu_moving) )
- return -EADDRNOTAVAIL;
- ret = schedule_cpu_add(cpumask_first(cpus), c);
- if ( ret )
- return ret;
-
- rcu_read_lock(&sched_res_rculock);
-
- cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus);
- if (cpupool_moving_cpu == cpu)
- {
- cpupool_moving_cpu = -1;
- cpupool_put(cpupool_cpu_moving);
- cpupool_cpu_moving = NULL;
- }
- cpumask_or(c->cpu_valid, c->cpu_valid, cpus);
- cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask);
-
- rcu_read_unlock(&sched_res_rculock);
-
- rcu_read_lock(&domlist_read_lock);
- for_each_domain_in_cpupool(d, c)
- {
- domain_update_node_affinity(d);
- }
- rcu_read_unlock(&domlist_read_lock);
-
- return 0;
-}
-
-static int cpupool_unassign_cpu_finish(struct cpupool *c)
-{
- int cpu = cpupool_moving_cpu;
- const cpumask_t *cpus;
- struct domain *d;
- int ret;
-
- if ( c != cpupool_cpu_moving )
- return -EADDRNOTAVAIL;
-
- /*
- * We need this for scanning the domain list, both in
- * cpu_disable_scheduler(), and at the bottom of this function.
- */
- rcu_read_lock(&domlist_read_lock);
- ret = cpu_disable_scheduler(cpu);
-
- rcu_read_lock(&sched_res_rculock);
- cpus = get_sched_res(cpu)->cpus;
- cpumask_or(&cpupool_free_cpus, &cpupool_free_cpus, cpus);
-
- /*
- * cpu_disable_scheduler() returning an error doesn't require resetting
- * cpupool_free_cpus' cpu bit. All error cases should be of temporary
- * nature and tools will retry the operation. Even if the number of
- * retries may be limited, the in-between state can easily be repaired
- * by adding the cpu to the cpupool again.
- */
- if ( !ret )
- {
- ret = schedule_cpu_rm(cpu);
- if ( ret )
- cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus);
- else
- {
- cpupool_moving_cpu = -1;
- cpupool_put(cpupool_cpu_moving);
- cpupool_cpu_moving = NULL;
- }
- }
- rcu_read_unlock(&sched_res_rculock);
-
- for_each_domain_in_cpupool(d, c)
- {
- domain_update_node_affinity(d);
- }
- rcu_read_unlock(&domlist_read_lock);
-
- return ret;
-}
-
-static int cpupool_unassign_cpu_start(struct cpupool *c, unsigned int cpu)
-{
- int ret;
- struct domain *d;
- const cpumask_t *cpus;
-
- spin_lock(&cpupool_lock);
- ret = -EADDRNOTAVAIL;
- if ( ((cpupool_moving_cpu != -1) || !cpumask_test_cpu(cpu, c->cpu_valid))
- && (cpu != cpupool_moving_cpu) )
- goto out;
-
- ret = 0;
- rcu_read_lock(&sched_res_rculock);
- cpus = get_sched_res(cpu)->cpus;
-
- if ( (c->n_dom > 0) &&
- (cpumask_weight(c->cpu_valid) == cpumask_weight(cpus)) &&
- (cpu != cpupool_moving_cpu) )
- {
- rcu_read_lock(&domlist_read_lock);
- for_each_domain_in_cpupool(d, c)
- {
- if ( !d->is_dying && system_state == SYS_STATE_active )
- {
- ret = -EBUSY;
- break;
- }
- ret = cpupool_move_domain_locked(d, cpupool0);
- if ( ret )
- break;
- }
- rcu_read_unlock(&domlist_read_lock);
- if ( ret )
- goto out;
- }
- cpupool_moving_cpu = cpu;
- atomic_inc(&c->refcnt);
- cpupool_cpu_moving = c;
- cpumask_andnot(c->cpu_valid, c->cpu_valid, cpus);
- cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask);
-
- rcu_read_unlock(&domlist_read_lock);
-out:
- spin_unlock(&cpupool_lock);
-
- return ret;
-}
-
-static long cpupool_unassign_cpu_helper(void *info)
-{
- struct cpupool *c = info;
- long ret;
-
- debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n",
- cpupool_cpu_moving->cpupool_id, cpupool_moving_cpu);
- spin_lock(&cpupool_lock);
-
- ret = cpupool_unassign_cpu_finish(c);
-
- spin_unlock(&cpupool_lock);
- debugtrace_printk("cpupool_unassign_cpu ret=%ld\n", ret);
-
- return ret;
-}
-
-/*
- * unassign a specific cpu from a cpupool
- * we must be sure not to run on the cpu to be unassigned! to achieve this
- * the main functionality is performed via continue_hypercall_on_cpu on a
- * specific cpu.
- * if the cpu to be removed is the last one of the cpupool no active domain
- * must be bound to the cpupool. dying domains are moved to cpupool0 as they
- * might be zombies.
- * possible failures:
- * - last cpu and still active domains in cpupool
- * - cpu just being unplugged
- */
-static int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu)
-{
- int work_cpu;
- int ret;
- unsigned int master_cpu;
-
- debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d)\n",
- c->cpupool_id, cpu);
-
- master_cpu = sched_get_resource_cpu(cpu);
- ret = cpupool_unassign_cpu_start(c, master_cpu);
- if ( ret )
- {
- debugtrace_printk("cpupool_unassign_cpu(pool=%d,cpu=%d) ret %d\n",
- c->cpupool_id, cpu, ret);
- return ret;
- }
-
- work_cpu = sched_get_resource_cpu(smp_processor_id());
- if ( work_cpu == master_cpu )
- {
- work_cpu = cpumask_first(cpupool0->cpu_valid);
- if ( work_cpu == master_cpu )
- work_cpu = cpumask_last(cpupool0->cpu_valid);
- }
- return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c);
-}
-
-/*
- * add a new domain to a cpupool
- * possible failures:
- * - pool does not exist
- * - no cpu assigned to pool
- */
-int cpupool_add_domain(struct domain *d, int poolid)
-{
- struct cpupool *c;
- int rc;
- int n_dom = 0;
-
- if ( poolid == CPUPOOLID_NONE )
- return 0;
- spin_lock(&cpupool_lock);
- c = cpupool_find_by_id(poolid);
- if ( c == NULL )
- rc = -ESRCH;
- else if ( !cpumask_weight(c->cpu_valid) )
- rc = -ENODEV;
- else
- {
- c->n_dom++;
- n_dom = c->n_dom;
- d->cpupool = c;
- rc = 0;
- }
- spin_unlock(&cpupool_lock);
- debugtrace_printk("cpupool_add_domain(dom=%d,pool=%d) n_dom %d rc %d\n",
- d->domain_id, poolid, n_dom, rc);
- return rc;
-}
-
-/*
- * remove a domain from a cpupool
- */
-void cpupool_rm_domain(struct domain *d)
-{
- int cpupool_id;
- int n_dom;
-
- if ( d->cpupool == NULL )
- return;
- spin_lock(&cpupool_lock);
- cpupool_id = d->cpupool->cpupool_id;
- d->cpupool->n_dom--;
- n_dom = d->cpupool->n_dom;
- d->cpupool = NULL;
- spin_unlock(&cpupool_lock);
- debugtrace_printk("cpupool_rm_domain(dom=%d,pool=%d) n_dom %d\n",
- d->domain_id, cpupool_id, n_dom);
- return;
-}
-
-/*
- * Called to add a cpu to a pool. CPUs being hot-plugged are added to pool0,
- * as they must have been in there when unplugged.
- */
-static int cpupool_cpu_add(unsigned int cpu)
-{
- int ret = 0;
- const cpumask_t *cpus;
-
- spin_lock(&cpupool_lock);
- cpumask_clear_cpu(cpu, &cpupool_locked_cpus);
- cpumask_set_cpu(cpu, &cpupool_free_cpus);
-
- /*
- * If we are not resuming, we are hot-plugging a cpu, in which case
- * we add it to pool0, as it certainly was there when hot-unplugged
- * (or unplugging would have failed) and that is the default behavior
- * anyway.
- */
- rcu_read_lock(&sched_res_rculock);
- get_sched_res(cpu)->cpupool = NULL;
-
- cpus = sched_get_opt_cpumask(cpupool0->gran, cpu);
- if ( cpumask_subset(cpus, &cpupool_free_cpus) )
- ret = cpupool_assign_cpu_locked(cpupool0, cpu);
-
- rcu_read_unlock(&sched_res_rculock);
-
- spin_unlock(&cpupool_lock);
-
- return ret;
-}
-
-/*
- * This function is called in stop_machine context, so we can be sure no
- * non-idle vcpu is active on the system.
- */
-static void cpupool_cpu_remove(unsigned int cpu)
-{
- int ret;
-
- ASSERT(is_idle_vcpu(current));
-
- if ( !cpumask_test_cpu(cpu, &cpupool_free_cpus) )
- {
- ret = cpupool_unassign_cpu_finish(cpupool0);
- BUG_ON(ret);
- }
- cpumask_clear_cpu(cpu, &cpupool_free_cpus);
-}
-
-/*
- * Called before a CPU is being removed from the system.
- * Removing a CPU is allowed for free CPUs or CPUs in Pool-0 (those are moved
- * to free cpus actually before removing them).
- * The CPU is locked, to forbid adding it again to another cpupool.
- */
-static int cpupool_cpu_remove_prologue(unsigned int cpu)
-{
- int ret = 0;
- cpumask_t *cpus;
- unsigned int master_cpu;
-
- spin_lock(&cpupool_lock);
-
- rcu_read_lock(&sched_res_rculock);
- cpus = get_sched_res(cpu)->cpus;
- master_cpu = sched_get_resource_cpu(cpu);
- if ( cpumask_intersects(cpus, &cpupool_locked_cpus) )
- ret = -EBUSY;
- else
- cpumask_set_cpu(cpu, &cpupool_locked_cpus);
- rcu_read_unlock(&sched_res_rculock);
-
- spin_unlock(&cpupool_lock);
-
- if ( ret )
- return ret;
-
- if ( cpumask_test_cpu(master_cpu, cpupool0->cpu_valid) )
- {
- /* Cpupool0 is populated only after all cpus are up. */
- ASSERT(system_state == SYS_STATE_active);
-
- ret = cpupool_unassign_cpu_start(cpupool0, master_cpu);
- }
- else if ( !cpumask_test_cpu(master_cpu, &cpupool_free_cpus) )
- ret = -ENODEV;
-
- return ret;
-}
-
-/*
- * Called during resume for all cpus which didn't come up again. The cpu must
- * be removed from the cpupool it is assigned to. In case a cpupool will be
- * left without cpu we move all domains of that cpupool to cpupool0.
- * As we are called with all domains still frozen there is no need to take the
- * cpupool lock here.
- */
-static void cpupool_cpu_remove_forced(unsigned int cpu)
-{
- struct cpupool **c;
- int ret;
- unsigned int master_cpu = sched_get_resource_cpu(cpu);
-
- for_each_cpupool ( c )
- {
- if ( cpumask_test_cpu(master_cpu, (*c)->cpu_valid) )
- {
- ret = cpupool_unassign_cpu_start(*c, master_cpu);
- BUG_ON(ret);
- ret = cpupool_unassign_cpu_finish(*c);
- BUG_ON(ret);
- }
- }
-
- cpumask_clear_cpu(cpu, &cpupool_free_cpus);
-
- rcu_read_lock(&sched_res_rculock);
- sched_rm_cpu(cpu);
- rcu_read_unlock(&sched_res_rculock);
-}
-
-/*
- * do cpupool related sysctl operations
- */
-int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op)
-{
- int ret;
- struct cpupool *c;
-
- switch ( op->op )
- {
-
- case XEN_SYSCTL_CPUPOOL_OP_CREATE:
- {
- int poolid;
-
- poolid = (op->cpupool_id == XEN_SYSCTL_CPUPOOL_PAR_ANY) ?
- CPUPOOLID_NONE: op->cpupool_id;
- c = cpupool_create(poolid, op->sched_id, &ret);
- if ( c != NULL )
- {
- op->cpupool_id = c->cpupool_id;
- cpupool_put(c);
- }
- }
- break;
-
- case XEN_SYSCTL_CPUPOOL_OP_DESTROY:
- {
- c = cpupool_get_by_id(op->cpupool_id);
- ret = -ENOENT;
- if ( c == NULL )
- break;
- ret = cpupool_destroy(c);
- cpupool_put(c);
- }
- break;
-
- case XEN_SYSCTL_CPUPOOL_OP_INFO:
- {
- c = cpupool_get_next_by_id(op->cpupool_id);
- ret = -ENOENT;
- if ( c == NULL )
- break;
- op->cpupool_id = c->cpupool_id;
- op->sched_id = c->sched->sched_id;
- op->n_dom = c->n_dom;
- ret = cpumask_to_xenctl_bitmap(&op->cpumap, c->cpu_valid);
- cpupool_put(c);
- }
- break;
-
- case XEN_SYSCTL_CPUPOOL_OP_ADDCPU:
- {
- unsigned cpu;
- const cpumask_t *cpus;
-
- cpu = op->cpu;
- debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d)\n",
- op->cpupool_id, cpu);
-
- spin_lock(&cpupool_lock);
-
- c = cpupool_find_by_id(op->cpupool_id);
- ret = -ENOENT;
- if ( c == NULL )
- goto addcpu_out;
- if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY )
- {
- for_each_cpu ( cpu, &cpupool_free_cpus )
- {
- cpus = sched_get_opt_cpumask(c->gran, cpu);
- if ( cpumask_subset(cpus, &cpupool_free_cpus) )
- break;
- }
- ret = -ENODEV;
- if ( cpu >= nr_cpu_ids )
- goto addcpu_out;
- }
- ret = -EINVAL;
- if ( cpu >= nr_cpu_ids )
- goto addcpu_out;
- ret = -ENODEV;
- cpus = sched_get_opt_cpumask(c->gran, cpu);
- if ( !cpumask_subset(cpus, &cpupool_free_cpus) ||
- cpumask_intersects(cpus, &cpupool_locked_cpus) )
- goto addcpu_out;
- ret = cpupool_assign_cpu_locked(c, cpu);
-
- addcpu_out:
- spin_unlock(&cpupool_lock);
- debugtrace_printk("cpupool_assign_cpu(pool=%d,cpu=%d) ret %d\n",
- op->cpupool_id, cpu, ret);
-
- }
- break;
-
- case XEN_SYSCTL_CPUPOOL_OP_RMCPU:
- {
- unsigned cpu;
-
- c = cpupool_get_by_id(op->cpupool_id);
- ret = -ENOENT;
- if ( c == NULL )
- break;
- cpu = op->cpu;
- if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY )
- cpu = cpumask_last(c->cpu_valid);
- ret = (cpu < nr_cpu_ids) ? cpupool_unassign_cpu(c, cpu) : -EINVAL;
- cpupool_put(c);
- }
- break;
-
- case XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN:
- {
- struct domain *d;
-
- ret = rcu_lock_remote_domain_by_id(op->domid, &d);
- if ( ret )
- break;
- if ( d->cpupool == NULL )
- {
- ret = -EINVAL;
- rcu_unlock_domain(d);
- break;
- }
- if ( op->cpupool_id == d->cpupool->cpupool_id )
- {
- ret = 0;
- rcu_unlock_domain(d);
- break;
- }
- debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d\n",
- d->domain_id, op->cpupool_id);
- ret = -ENOENT;
- spin_lock(&cpupool_lock);
-
- c = cpupool_find_by_id(op->cpupool_id);
- if ( (c != NULL) && cpumask_weight(c->cpu_valid) )
- ret = cpupool_move_domain_locked(d, c);
-
- spin_unlock(&cpupool_lock);
- debugtrace_printk("cpupool move_domain(dom=%d)->pool=%d ret %d\n",
- d->domain_id, op->cpupool_id, ret);
- rcu_unlock_domain(d);
- }
- break;
-
- case XEN_SYSCTL_CPUPOOL_OP_FREEINFO:
- {
- ret = cpumask_to_xenctl_bitmap(
- &op->cpumap, &cpupool_free_cpus);
- }
- break;
-
- default:
- ret = -ENOSYS;
- break;
- }
-
- return ret;
-}
-
-void dump_runq(unsigned char key)
-{
- unsigned long flags;
- s_time_t now = NOW();
- struct cpupool **c;
-
- spin_lock(&cpupool_lock);
- local_irq_save(flags);
-
- printk("sched_smt_power_savings: %s\n",
- sched_smt_power_savings? "enabled":"disabled");
- printk("NOW=%"PRI_stime"\n", now);
-
- printk("Online Cpus: %*pbl\n", CPUMASK_PR(&cpu_online_map));
- if ( !cpumask_empty(&cpupool_free_cpus) )
- {
- printk("Free Cpus: %*pbl\n", CPUMASK_PR(&cpupool_free_cpus));
- schedule_dump(NULL);
- }
-
- for_each_cpupool(c)
- {
- printk("Cpupool %d:\n", (*c)->cpupool_id);
- printk("Cpus: %*pbl\n", CPUMASK_PR((*c)->cpu_valid));
- schedule_dump(*c);
- }
-
- local_irq_restore(flags);
- spin_unlock(&cpupool_lock);
-}
-
-static int cpu_callback(
- struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- int rc = 0;
-
- switch ( action )
- {
- case CPU_DOWN_FAILED:
- case CPU_ONLINE:
- if ( system_state <= SYS_STATE_active )
- rc = cpupool_cpu_add(cpu);
- break;
- case CPU_DOWN_PREPARE:
- /* Suspend/Resume don't change assignments of cpus to cpupools. */
- if ( system_state <= SYS_STATE_active )
- rc = cpupool_cpu_remove_prologue(cpu);
- break;
- case CPU_DYING:
- /* Suspend/Resume don't change assignments of cpus to cpupools. */
- if ( system_state <= SYS_STATE_active )
- cpupool_cpu_remove(cpu);
- break;
- case CPU_RESUME_FAILED:
- cpupool_cpu_remove_forced(cpu);
- break;
- default:
- break;
- }
-
- return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
-}
-
-static struct notifier_block cpu_nfb = {
- .notifier_call = cpu_callback
-};
-
-static int __init cpupool_init(void)
-{
- unsigned int cpu;
- int err;
-
- cpupool_gran_init();
-
- cpupool0 = cpupool_create(0, 0, &err);
- BUG_ON(cpupool0 == NULL);
- cpupool_put(cpupool0);
- register_cpu_notifier(&cpu_nfb);
-
- spin_lock(&cpupool_lock);
-
- cpumask_copy(&cpupool_free_cpus, &cpu_online_map);
-
- for_each_cpu ( cpu, &cpupool_free_cpus )
- cpupool_assign_cpu_locked(cpupool0, cpu);
-
- spin_unlock(&cpupool_lock);
-
- return 0;
-}
-__initcall(cpupool_init);
-
-/*
- * Local variables:
- * mode: C
- * c-file-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
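
The list handling in cpupool.c above (for_each_cpupool(), __cpupool_find_by_id(), and the sorted insertion in cpupool_create()) relies on the pointer-to-pointer idiom: iterating over a struct cpupool ** lets both lookup and insertion splice an element without special-casing the list head. A minimal standalone sketch of that idiom, with made-up types and names rather than Xen's, might look like this:

    /*
     * Hedged sketch of the pointer-to-pointer list idiom used by
     * for_each_cpupool() / cpupool_create(): the list stays sorted by id
     * and insertion needs no special case for the head.  Illustrative only.
     */
    #include <stdio.h>
    #include <stdlib.h>

    struct pool {
        int id;
        struct pool *next;
    };

    static struct pool *pool_list;      /* sorted by id, like cpupool_list */

    static struct pool *pool_insert(int id)
    {
        struct pool **q, *p;

        /* Find the insertion point: first entry with an id >= the new one. */
        for ( q = &pool_list; *q != NULL; q = &(*q)->next )
            if ( (*q)->id >= id )
                break;

        if ( *q != NULL && (*q)->id == id )
            return NULL;                /* duplicate id (-EEXIST upstream) */

        p = malloc(sizeof(*p));
        if ( p == NULL )
            return NULL;

        p->id = id;
        p->next = *q;                   /* splice in front of *q (may be NULL) */
        *q = p;

        return p;
    }

    int main(void)
    {
        struct pool *p;

        pool_insert(2);
        pool_insert(0);
        pool_insert(1);

        for ( p = pool_list; p != NULL; p = p->next )
            printf("pool %d\n", p->id); /* prints 0, 1, 2 */

        return 0;
    }
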
diff --git a/xen/common/sched/Kconfig b/xen/common/sched/Kconfig
new file mode 100644
index 0000000000..883ac87cab
--- /dev/null
+++ b/xen/common/sched/Kconfig
@@ -0,0 +1,65 @@
+menu "Schedulers"
+ visible if EXPERT = "y"
+
+config SCHED_CREDIT
+ bool "Credit scheduler support"
+ default y
+ ---help---
+ The traditional credit scheduler is a general purpose scheduler.
+
+config SCHED_CREDIT2
+ bool "Credit2 scheduler support"
+ default y
+ ---help---
+ The credit2 scheduler is a general purpose scheduler that is
+ optimized for lower latency and higher VM density.
+
+config SCHED_RTDS
+ bool "RTDS scheduler support (EXPERIMENTAL)"
+ default y
+ ---help---
+ The RTDS scheduler is a soft and firm real-time scheduler for
+ multicore, targeted for embedded, automotive, graphics and gaming
+ in the cloud, and general low-latency workloads.
+
+config SCHED_ARINC653
+ bool "ARINC653 scheduler support (EXPERIMENTAL)"
+ default DEBUG
+ ---help---
+ The ARINC653 scheduler is a hard real-time scheduler for single
+ cores, targeted for avionics, drones, and medical devices.
+
+config SCHED_NULL
+ bool "Null scheduler support (EXPERIMENTAL)"
+ default y
+ ---help---
+ The null scheduler is a static, zero overhead scheduler,
+ for when there are always fewer vCPUs than pCPUs, typically
+ in embedded or HPC scenarios.
+
+choice
+ prompt "Default Scheduler?"
+ default SCHED_CREDIT2_DEFAULT
+
+ config SCHED_CREDIT_DEFAULT
+ bool "Credit Scheduler" if SCHED_CREDIT
+ config SCHED_CREDIT2_DEFAULT
+ bool "Credit2 Scheduler" if SCHED_CREDIT2
+ config SCHED_RTDS_DEFAULT
+ bool "RT Scheduler" if SCHED_RTDS
+ config SCHED_ARINC653_DEFAULT
+ bool "ARINC653 Scheduler" if SCHED_ARINC653
+ config SCHED_NULL_DEFAULT
+ bool "Null Scheduler" if SCHED_NULL
+endchoice
+
+config SCHED_DEFAULT
+ string
+ default "credit" if SCHED_CREDIT_DEFAULT
+ default "credit2" if SCHED_CREDIT2_DEFAULT
+ default "rtds" if SCHED_RTDS_DEFAULT
+ default "arinc653" if SCHED_ARINC653_DEFAULT
+ default "null" if SCHED_NULL_DEFAULT
+ default "credit2"
+
+endmenu
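
The Kconfig above only produces a default name string (CONFIG_SCHED_DEFAULT, e.g. "credit2"); the scheduler actually used is resolved at boot by matching the "sched=" command line parameter (opt_sched in core.c below) against the opt_name of each built-in scheduler. A hedged sketch of that kind of lookup, with illustrative table entries rather than the real registration machinery, is:

    /*
     * Hedged sketch: matching a "sched=<opt_name>" boot parameter against
     * built-in schedulers.  Entries and helper names are illustrative; this
     * is not the actual Xen lookup code.
     */
    #include <stdio.h>
    #include <string.h>

    struct sched_desc {
        const char *name;       /* human readable name */
        const char *opt_name;   /* value accepted by "sched=" */
    };

    static const struct sched_desc schedulers[] = {
        { "Credit Scheduler",    "credit"   },
        { "Credit2 Scheduler",   "credit2"  },
        { "ARINC 653 Scheduler", "arinc653" },
        { "Null Scheduler",      "null"     },
    };

    static const struct sched_desc *find_scheduler(const char *opt)
    {
        unsigned int i;

        for ( i = 0; i < sizeof(schedulers) / sizeof(schedulers[0]); i++ )
            if ( strcmp(schedulers[i].opt_name, opt) == 0 )
                return &schedulers[i];

        return NULL;            /* caller falls back to CONFIG_SCHED_DEFAULT */
    }

    int main(void)
    {
        const struct sched_desc *s = find_scheduler("credit2");

        printf("%s\n", s ? s->name : "unknown");
        return 0;
    }
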
diff --git a/xen/common/sched/Makefile b/xen/common/sched/Makefile
new file mode 100644
index 0000000000..3537f2a68d
--- /dev/null
+++ b/xen/common/sched/Makefile
@@ -0,0 +1,7 @@
+obj-y += cpupool.o
+obj-$(CONFIG_SCHED_ARINC653) += arinc653.o
+obj-$(CONFIG_SCHED_CREDIT) += credit.o
+obj-$(CONFIG_SCHED_CREDIT2) += credit2.o
+obj-$(CONFIG_SCHED_RTDS) += rt.o
+obj-$(CONFIG_SCHED_NULL) += null.o
+obj-y += core.o
diff --git a/xen/common/sched/arinc653.c b/xen/common/sched/arinc653.c
new file mode 100644
index 0000000000..565575c326
--- /dev/null
+++ b/xen/common/sched/arinc653.c
@@ -0,0 +1,739 @@
+/******************************************************************************
+ * sched_arinc653.c
+ *
+ * An ARINC653-compatible scheduling algorithm for use in Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2010, DornerWorks, Ltd. <DornerWorks.com>
+ */
+
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/timer.h>
+#include <xen/softirq.h>
+#include <xen/time.h>
+#include <xen/errno.h>
+#include <xen/list.h>
+#include <xen/guest_access.h>
+#include <public/sysctl.h>
+
+/**************************************************************************
+ * Private Macros *
+ **************************************************************************/
+
+/**
+ * Default timeslice for domain 0.
+ */
+#define DEFAULT_TIMESLICE MILLISECS(10)
+
+/**
+ * Retrieve the idle UNIT for a given physical CPU
+ */
+#define IDLETASK(cpu) (sched_idle_unit(cpu))
+
+/**
+ * Return a pointer to the ARINC 653-specific scheduler data information
+ * associated with the given UNIT (unit)
+ */
+#define AUNIT(unit) ((arinc653_unit_t *)(unit)->priv)
+
+/**
+ * Return the global scheduler private data given the scheduler ops pointer
+ */
+#define SCHED_PRIV(s) ((a653sched_priv_t *)((s)->sched_data))
+
+/**************************************************************************
+ * Private Type Definitions *
+ **************************************************************************/
+
+/**
+ * The arinc653_unit_t structure holds ARINC 653-scheduler-specific
+ * information for all non-idle UNITs
+ */
+typedef struct arinc653_unit_s
+{
+ /* unit points to Xen's struct sched_unit so we can get to it from an
+ * arinc653_unit_t pointer. */
+ struct sched_unit * unit;
+ /* awake holds whether the UNIT has been woken with vcpu_wake() */
+ bool_t awake;
+ /* list holds the linked list information for the list this UNIT
+ * is stored in */
+ struct list_head list;
+} arinc653_unit_t;
+
+/**
+ * The sched_entry_t structure holds a single entry of the
+ * ARINC 653 schedule.
+ */
+typedef struct sched_entry_s
+{
+ /* dom_handle holds the handle ("UUID") for the domain that this
+ * schedule entry refers to. */
+ xen_domain_handle_t dom_handle;
+ /* unit_id holds the UNIT number for the UNIT that this schedule
+ * entry refers to. */
+ int unit_id;
+ /* runtime holds the number of nanoseconds that the UNIT for this
+ * schedule entry should be allowed to run per major frame. */
+ s_time_t runtime;
+ /* unit holds a pointer to the Xen sched_unit structure */
+ struct sched_unit * unit;
+} sched_entry_t;
+
+/**
+ * This structure defines data that is global to an instance of the scheduler
+ */
+typedef struct a653sched_priv_s
+{
+ /* lock for the whole pluggable scheduler, nests inside cpupool_lock */
+ spinlock_t lock;
+
+ /**
+ * This array holds the active ARINC 653 schedule.
+ *
+ * When the system tries to start a new UNIT, this schedule is scanned
+ * to look for a matching (handle, UNIT #) pair. If both the handle (UUID)
+ * and UNIT number match, then the UNIT is allowed to run. Its run time
+ * (per major frame) is given in the third entry of the schedule.
+ */
+ sched_entry_t schedule[ARINC653_MAX_DOMAINS_PER_SCHEDULE];
+
+ /**
+ * This variable holds the number of entries that are valid in
+ * the arinc653_schedule table.
+ *
+ * This is not necessarily the same as the number of domains in the
+ * schedule. A domain could be listed multiple times within the schedule,
+ * or a domain with multiple UNITs could have a different
+ * schedule entry for each UNIT.
+ */
+ unsigned int num_schedule_entries;
+
+ /**
+ * the major frame time for the ARINC 653 schedule.
+ */
+ s_time_t major_frame;
+
+ /**
+ * the time that the next major frame starts
+ */
+ s_time_t next_major_frame;
+
+ /**
+ * pointers to all Xen UNIT structures for iterating through
+ */
+ struct list_head unit_list;
+} a653sched_priv_t;
+
+/**************************************************************************
+ * Helper functions *
+ **************************************************************************/
+
+/**
+ * This function compares two domain handles.
+ *
+ * @param h1 Pointer to handle 1
+ * @param h2 Pointer to handle 2
+ *
+ * @return <ul>
+ * <li> <0: handle 1 is less than handle 2
+ * <li> 0: handle 1 is equal to handle 2
+ * <li> >0: handle 1 is greater than handle 2
+ * </ul>
+ */
+static int dom_handle_cmp(const xen_domain_handle_t h1,
+ const xen_domain_handle_t h2)
+{
+ return memcmp(h1, h2, sizeof(xen_domain_handle_t));
+}
+
+/**
+ * This function searches the unit list to find a UNIT that matches
+ * the domain handle and UNIT ID specified.
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @param handle Pointer to handler
+ * @param unit_id UNIT ID
+ *
+ * @return <ul>
+ * <li> Pointer to the matching UNIT if one is found
+ * <li> NULL otherwise
+ * </ul>
+ */
+static struct sched_unit *find_unit(
+ const struct scheduler *ops,
+ xen_domain_handle_t handle,
+ int unit_id)
+{
+ arinc653_unit_t *aunit;
+
+ /* loop through the unit_list looking for the specified UNIT */
+ list_for_each_entry ( aunit, &SCHED_PRIV(ops)->unit_list, list )
+ if ( (dom_handle_cmp(aunit->unit->domain->handle, handle) == 0)
+ && (unit_id == aunit->unit->unit_id) )
+ return aunit->unit;
+
+ return NULL;
+}
+
+/**
+ * This function updates the pointer to the Xen UNIT structure for each entry
+ * in the ARINC 653 schedule.
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @return <None>
+ */
+static void update_schedule_units(const struct scheduler *ops)
+{
+ unsigned int i, n_entries = SCHED_PRIV(ops)->num_schedule_entries;
+
+ for ( i = 0; i < n_entries; i++ )
+ SCHED_PRIV(ops)->schedule[i].unit =
+ find_unit(ops,
+ SCHED_PRIV(ops)->schedule[i].dom_handle,
+ SCHED_PRIV(ops)->schedule[i].unit_id);
+}
+
+/**
+ * This function is called by the adjust_global scheduler hook to put
+ * in place a new ARINC653 schedule.
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ *
+ * @return <ul>
+ * <li> 0 = success
+ * <li> !0 = error
+ * </ul>
+ */
+static int
+arinc653_sched_set(
+ const struct scheduler *ops,
+ struct xen_sysctl_arinc653_schedule *schedule)
+{
+ a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
+ s_time_t total_runtime = 0;
+ unsigned int i;
+ unsigned long flags;
+ int rc = -EINVAL;
+
+ spin_lock_irqsave(&sched_priv->lock, flags);
+
+ /* Check for valid major frame and number of schedule entries. */
+ if ( (schedule->major_frame <= 0)
+ || (schedule->num_sched_entries < 1)
+ || (schedule->num_sched_entries > ARINC653_MAX_DOMAINS_PER_SCHEDULE) )
+ goto fail;
+
+ for ( i = 0; i < schedule->num_sched_entries; i++ )
+ {
+ /* Check for a valid run time. */
+ if ( schedule->sched_entries[i].runtime <= 0 )
+ goto fail;
+
+ /* Add this entry's run time to total run time. */
+ total_runtime += schedule->sched_entries[i].runtime;
+ }
+
+ /*
+ * Error if the major frame is not large enough to run all entries as
+ * indicated by comparing the total run time to the major frame length.
+ */
+ if ( total_runtime > schedule->major_frame )
+ goto fail;
+
+ /* Copy the new schedule into place. */
+ sched_priv->num_schedule_entries = schedule->num_sched_entries;
+ sched_priv->major_frame = schedule->major_frame;
+ for ( i = 0; i < schedule->num_sched_entries; i++ )
+ {
+ memcpy(sched_priv->schedule[i].dom_handle,
+ schedule->sched_entries[i].dom_handle,
+ sizeof(sched_priv->schedule[i].dom_handle));
+ sched_priv->schedule[i].unit_id =
+ schedule->sched_entries[i].vcpu_id;
+ sched_priv->schedule[i].runtime =
+ schedule->sched_entries[i].runtime;
+ }
+ update_schedule_units(ops);
+
+ /*
+ * The newly-installed schedule takes effect immediately. We do not even
+ * wait for the current major frame to expire.
+ *
+ * Signal a new major frame to begin. The next major frame is set up by
+ * the do_schedule callback function when it is next invoked.
+ */
+ sched_priv->next_major_frame = NOW();
+
+ rc = 0;
+
+ fail:
+ spin_unlock_irqrestore(&sched_priv->lock, flags);
+ return rc;
+}
+
+/**
+ * This function is called by the adjust_global scheduler hook to read the
+ * current ARINC 653 schedule
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @return <ul>
+ * <li> 0 = success
+ * <li> !0 = error
+ * </ul>
+ */
+static int
+arinc653_sched_get(
+ const struct scheduler *ops,
+ struct xen_sysctl_arinc653_schedule *schedule)
+{
+ a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
+ unsigned int i;
+ unsigned long flags;
+
+ spin_lock_irqsave(&sched_priv->lock, flags);
+
+ schedule->num_sched_entries = sched_priv->num_schedule_entries;
+ schedule->major_frame = sched_priv->major_frame;
+ for ( i = 0; i < sched_priv->num_schedule_entries; i++ )
+ {
+ memcpy(schedule->sched_entries[i].dom_handle,
+ sched_priv->schedule[i].dom_handle,
+ sizeof(sched_priv->schedule[i].dom_handle));
+ schedule->sched_entries[i].vcpu_id = sched_priv->schedule[i].unit_id;
+ schedule->sched_entries[i].runtime = sched_priv->schedule[i].runtime;
+ }
+
+ spin_unlock_irqrestore(&sched_priv->lock, flags);
+
+ return 0;
+}
+
+/**************************************************************************
+ * Scheduler callback functions *
+ **************************************************************************/
+
+/**
+ * This function performs initialization for an instance of the scheduler.
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ *
+ * @return <ul>
+ * <li> 0 = success
+ * <li> !0 = error
+ * </ul>
+ */
+static int
+a653sched_init(struct scheduler *ops)
+{
+ a653sched_priv_t *prv;
+
+ prv = xzalloc(a653sched_priv_t);
+ if ( prv == NULL )
+ return -ENOMEM;
+
+ ops->sched_data = prv;
+
+ prv->next_major_frame = 0;
+ spin_lock_init(&prv->lock);
+ INIT_LIST_HEAD(&prv->unit_list);
+
+ return 0;
+}
+
+/**
+ * This function performs deinitialization for an instance of the scheduler
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ */
+static void
+a653sched_deinit(struct scheduler *ops)
+{
+ xfree(SCHED_PRIV(ops));
+ ops->sched_data = NULL;
+}
+
+/**
+ * This function allocates scheduler-specific data for a UNIT
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @param unit Pointer to struct sched_unit
+ *
+ * @return Pointer to the allocated data
+ */
+static void *
+a653sched_alloc_udata(const struct scheduler *ops, struct sched_unit *unit,
+ void *dd)
+{
+ a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
+ arinc653_unit_t *svc;
+ unsigned int entry;
+ unsigned long flags;
+
+ /*
+ * Allocate memory for the ARINC 653-specific scheduler data information
+ * associated with the given UNIT (unit).
+ */
+ svc = xmalloc(arinc653_unit_t);
+ if ( svc == NULL )
+ return NULL;
+
+ spin_lock_irqsave(&sched_priv->lock, flags);
+
+ /*
+ * Add every one of dom0's units to the schedule, as long as there are
+ * slots available.
+ */
+ if ( unit->domain->domain_id == 0 )
+ {
+ entry = sched_priv->num_schedule_entries;
+
+ if ( entry < ARINC653_MAX_DOMAINS_PER_SCHEDULE )
+ {
+ sched_priv->schedule[entry].dom_handle[0] = '\0';
+ sched_priv->schedule[entry].unit_id = unit->unit_id;
+ sched_priv->schedule[entry].runtime = DEFAULT_TIMESLICE;
+ sched_priv->schedule[entry].unit = unit;
+
+ sched_priv->major_frame += DEFAULT_TIMESLICE;
+ ++sched_priv->num_schedule_entries;
+ }
+ }
+
+ /*
+ * Initialize our ARINC 653 scheduler-specific information for the UNIT.
+ * The UNIT starts "asleep." When Xen is ready for the UNIT to run, it
+ * will call the vcpu_wake scheduler callback function and our scheduler
+ * will mark the UNIT awake.
+ */
+ svc->unit = unit;
+ svc->awake = 0;
+ if ( !is_idle_unit(unit) )
+ list_add(&svc->list, &SCHED_PRIV(ops)->unit_list);
+ update_schedule_units(ops);
+
+ spin_unlock_irqrestore(&sched_priv->lock, flags);
+
+ return svc;
+}
+
+/**
+ * This function frees scheduler-specific UNIT data
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ */
+static void
+a653sched_free_udata(const struct scheduler *ops, void *priv)
+{
+ a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
+ arinc653_unit_t *av = priv;
+ unsigned long flags;
+
+ if (av == NULL)
+ return;
+
+ spin_lock_irqsave(&sched_priv->lock, flags);
+
+ if ( !is_idle_unit(av->unit) )
+ list_del(&av->list);
+
+ xfree(av);
+ update_schedule_units(ops);
+
+ spin_unlock_irqrestore(&sched_priv->lock, flags);
+}
+
+/**
+ * Xen scheduler callback function to sleep a UNIT
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @param unit Pointer to struct sched_unit
+ */
+static void
+a653sched_unit_sleep(const struct scheduler *ops, struct sched_unit *unit)
+{
+ if ( AUNIT(unit) != NULL )
+ AUNIT(unit)->awake = 0;
+
+ /*
+ * If the UNIT being put to sleep is the same one that is currently
+ * running, raise a softirq to invoke the scheduler to switch domains.
+ */
+ if ( get_sched_res(sched_unit_master(unit))->curr == unit )
+ cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ);
+}
+
+/**
+ * Xen scheduler callback function to wake up a UNIT
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @param unit Pointer to struct sched_unit
+ */
+static void
+a653sched_unit_wake(const struct scheduler *ops, struct sched_unit *unit)
+{
+ if ( AUNIT(unit) != NULL )
+ AUNIT(unit)->awake = 1;
+
+ cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ);
+}
+
+/**
+ * Xen scheduler callback function to select a UNIT to run.
+ * This is the main scheduler routine.
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @param now Current time
+ */
+static void
+a653sched_do_schedule(
+ const struct scheduler *ops,
+ struct sched_unit *prev,
+ s_time_t now,
+ bool tasklet_work_scheduled)
+{
+ struct sched_unit *new_task = NULL;
+ static unsigned int sched_index = 0;
+ static s_time_t next_switch_time;
+ a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
+ const unsigned int cpu = sched_get_resource_cpu(smp_processor_id());
+ unsigned long flags;
+
+ spin_lock_irqsave(&sched_priv->lock, flags);
+
+ if ( sched_priv->num_schedule_entries < 1 )
+ sched_priv->next_major_frame = now + DEFAULT_TIMESLICE;
+ else if ( now >= sched_priv->next_major_frame )
+ {
+ /* time to enter a new major frame
+ * the first time this function is called, this will be true */
+ /* start with the first domain in the schedule */
+ sched_index = 0;
+ sched_priv->next_major_frame = now + sched_priv->major_frame;
+ next_switch_time = now + sched_priv->schedule[0].runtime;
+ }
+ else
+ {
+ while ( (now >= next_switch_time)
+ && (sched_index < sched_priv->num_schedule_entries) )
+ {
+ /* time to switch to the next domain in this major frame */
+ sched_index++;
+ next_switch_time += sched_priv->schedule[sched_index].runtime;
+ }
+ }
+
+ /*
+ * If we exhausted the domains in the schedule and still have time left
+ * in the major frame then switch next at the next major frame.
+ */
+ if ( sched_index >= sched_priv->num_schedule_entries )
+ next_switch_time = sched_priv->next_major_frame;
+
+ /*
+ * If there are more domains to run in the current major frame, set
+ * new_task equal to the address of next domain's sched_unit structure.
+ * Otherwise, set new_task equal to the address of the idle task's
+ * sched_unit structure.
+ */
+ new_task = (sched_index < sched_priv->num_schedule_entries)
+ ? sched_priv->schedule[sched_index].unit
+ : IDLETASK(cpu);
+
+ /* Check to see if the new task can be run (awake & runnable). */
+ if ( !((new_task != NULL)
+ && (AUNIT(new_task) != NULL)
+ && AUNIT(new_task)->awake
+ && unit_runnable_state(new_task)) )
+ new_task = IDLETASK(cpu);
+ BUG_ON(new_task == NULL);
+
+ /*
+ * Check to make sure we did not miss a major frame.
+ * This is a good test for robust partitioning.
+ */
+ BUG_ON(now >= sched_priv->next_major_frame);
+
+ spin_unlock_irqrestore(&sched_priv->lock, flags);
+
+ /* Tasklet work (which runs in idle UNIT context) overrides all else. */
+ if ( tasklet_work_scheduled )
+ new_task = IDLETASK(cpu);
+
+ /* Running this task would result in a migration */
+ if ( !is_idle_unit(new_task)
+ && (sched_unit_master(new_task) != cpu) )
+ new_task = IDLETASK(cpu);
+
+ /*
+ * Return the amount of time the next domain has to run and the address
+ * of the selected task's UNIT structure.
+ */
+ prev->next_time = next_switch_time - now;
+ prev->next_task = new_task;
+ new_task->migrated = false;
+
+ BUG_ON(prev->next_time <= 0);
+}
+
+/**
+ * Xen scheduler callback function to select a resource for the UNIT to run on
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @param unit Pointer to struct sched_unit
+ *
+ * @return Scheduler resource to run on
+ */
+static struct sched_resource *
+a653sched_pick_resource(const struct scheduler *ops,
+ const struct sched_unit *unit)
+{
+ cpumask_t *online;
+ unsigned int cpu;
+
+ /*
+ * If present, prefer unit's current processor, else
+ * just find the first valid unit.
+ */
+ online = cpupool_domain_master_cpumask(unit->domain);
+
+ cpu = cpumask_first(online);
+
+ if ( cpumask_test_cpu(sched_unit_master(unit), online)
+ || (cpu >= nr_cpu_ids) )
+ cpu = sched_unit_master(unit);
+
+ return get_sched_res(cpu);
+}
+
+/**
+ * Xen scheduler callback to change the scheduler of a cpu
+ *
+ * @param new_ops Pointer to this instance of the scheduler structure
+ * @param cpu The cpu that is changing scheduler
+ * @param pdata scheduler specific PCPU data (we don't have any)
+ * @param vdata scheduler specific UNIT data of the idle unit
+ */
+static spinlock_t *
+a653_switch_sched(struct scheduler *new_ops, unsigned int cpu,
+ void *pdata, void *vdata)
+{
+ struct sched_resource *sr = get_sched_res(cpu);
+ arinc653_unit_t *svc = vdata;
+
+ ASSERT(!pdata && svc && is_idle_unit(svc->unit));
+
+ sched_idle_unit(cpu)->priv = vdata;
+
+ return &sr->_lock;
+}
+
+/**
+ * Xen scheduler callback function to perform a global (not domain-specific)
+ * adjustment. It is used by the ARINC 653 scheduler to put in place a new
+ * ARINC 653 schedule or to retrieve the schedule currently in place.
+ *
+ * @param ops Pointer to this instance of the scheduler structure
+ * @param sc Pointer to the scheduler operation specified by Domain 0
+ */
+static int
+a653sched_adjust_global(const struct scheduler *ops,
+ struct xen_sysctl_scheduler_op *sc)
+{
+ struct xen_sysctl_arinc653_schedule local_sched;
+ int rc = -EINVAL;
+
+ switch ( sc->cmd )
+ {
+ case XEN_SYSCTL_SCHEDOP_putinfo:
+ if ( copy_from_guest(&local_sched, sc->u.sched_arinc653.schedule, 1) )
+ {
+ rc = -EFAULT;
+ break;
+ }
+
+ rc = arinc653_sched_set(ops, &local_sched);
+ break;
+ case XEN_SYSCTL_SCHEDOP_getinfo:
+ memset(&local_sched, -1, sizeof(local_sched));
+ rc = arinc653_sched_get(ops, &local_sched);
+ if ( rc )
+ break;
+
+ if ( copy_to_guest(sc->u.sched_arinc653.schedule, &local_sched, 1) )
+ rc = -EFAULT;
+ break;
+ }
+
+ return rc;
+}
+
+/**
+ * This structure defines our scheduler for Xen.
+ * The entries tell Xen where to find our scheduler-specific
+ * callback functions.
+ * The symbol must be visible to the rest of Xen at link time.
+ */
+static const struct scheduler sched_arinc653_def = {
+ .name = "ARINC 653 Scheduler",
+ .opt_name = "arinc653",
+ .sched_id = XEN_SCHEDULER_ARINC653,
+ .sched_data = NULL,
+
+ .init = a653sched_init,
+ .deinit = a653sched_deinit,
+
+ .free_udata = a653sched_free_udata,
+ .alloc_udata = a653sched_alloc_udata,
+
+ .insert_unit = NULL,
+ .remove_unit = NULL,
+
+ .sleep = a653sched_unit_sleep,
+ .wake = a653sched_unit_wake,
+ .yield = NULL,
+ .context_saved = NULL,
+
+ .do_schedule = a653sched_do_schedule,
+
+ .pick_resource = a653sched_pick_resource,
+
+ .switch_sched = a653_switch_sched,
+
+ .adjust = NULL,
+ .adjust_global = a653sched_adjust_global,
+
+ .dump_settings = NULL,
+ .dump_cpu_state = NULL,
+};
+
+REGISTER_SCHEDULER(sched_arinc653_def);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
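
a653sched_do_schedule() above walks a fixed table of per-entry runtimes inside a repeating major frame: it advances to the next entry whenever the accumulated runtime is exceeded and idles for any remainder of the frame. A simplified standalone model of that table walk, with hypothetical runtimes rather than Xen's data structures, is:

    /*
     * Simplified model of the ARINC 653 table walk: given the offset of
     * "now" into the current major frame, pick the entry to run, or idle
     * (-1) once all runtimes are consumed.  Illustrative values only.
     */
    #include <stdio.h>

    #define N_ENTRIES 3

    /* Per-entry runtimes in nanoseconds; their sum (20 ms) is the minimum
     * major frame length. */
    static const long long runtime_ns[N_ENTRIES] = {
        10000000,       /* entry 0: 10 ms */
         5000000,       /* entry 1:  5 ms */
         5000000,       /* entry 2:  5 ms */
    };

    static int pick_entry(long long offset_in_frame_ns)
    {
        long long next_switch = 0;
        int i;

        for ( i = 0; i < N_ENTRIES; i++ )
        {
            next_switch += runtime_ns[i];
            if ( offset_in_frame_ns < next_switch )
                return i;
        }

        return -1;      /* remainder of the major frame is idle */
    }

    int main(void)
    {
        /* Offsets of 0, 12 and 19 ms map to entries 0, 1 and 2; at 21 ms
         * the table is exhausted and the idle task runs until the next
         * major frame. */
        printf("%d %d %d %d\n",
               pick_entry(0), pick_entry(12000000),
               pick_entry(19000000), pick_entry(21000000)); /* 0 1 2 -1 */
        return 0;
    }
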
diff --git a/xen/common/sched/compat.c b/xen/common/sched/compat.c
new file mode 100644
index 0000000000..040b4caca2
--- /dev/null
+++ b/xen/common/sched/compat.c
@@ -0,0 +1,55 @@
+/****************************************************************************
+ * schedule.c
+ *
+ */
+
+#include <compat/sched.h>
+
+#define COMPAT
+#define ret_t int
+
+#define do_sched_op compat_sched_op
+
+#define xen_sched_pin_override sched_pin_override
+CHECK_sched_pin_override;
+#undef xen_sched_pin_override
+
+#define xen_sched_shutdown sched_shutdown
+CHECK_sched_shutdown;
+#undef xen_sched_shutdown
+
+#define xen_sched_remote_shutdown sched_remote_shutdown
+CHECK_sched_remote_shutdown;
+#undef xen_sched_remote_shutdown
+
+static int compat_poll(struct compat_sched_poll *compat)
+{
+ struct sched_poll native;
+
+#define XLAT_sched_poll_HNDL_ports(_d_, _s_) \
+ guest_from_compat_handle((_d_)->ports, (_s_)->ports)
+ XLAT_sched_poll(&native, compat);
+#undef XLAT_sched_poll_HNDL_ports
+
+ return do_poll(&native);
+}
+
+#define do_poll compat_poll
+#define sched_poll compat_sched_poll
+
+#include "core.c"
+
+int compat_set_timer_op(u32 lo, s32 hi)
+{
+ return do_set_timer_op(((s64)hi << 32) | lo);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
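
compat_set_timer_op() above reassembles the 64-bit deadline that a 32-bit (compat) guest passes as two 32-bit halves; widening the high half before the shift is what makes ((s64)hi << 32) | lo correct. A throwaway standalone check of that arithmetic (illustrative only) is:

    /*
     * Standalone check of the lo/hi recombination done by
     * compat_set_timer_op(); illustrative only, not Xen code.
     */
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    static int64_t combine(uint32_t lo, int32_t hi)
    {
        /* Widen hi before shifting, then OR in the value-preserved low half. */
        return ((int64_t)hi << 32) | lo;
    }

    int main(void)
    {
        int64_t deadline = 0x123456789abcdef0LL;    /* arbitrary deadline */
        uint32_t lo = (uint32_t)deadline;
        int32_t  hi = (int32_t)(deadline >> 32);

        printf("0x%" PRIx64 "\n", combine(lo, hi)); /* 0x123456789abcdef0 */

        return combine(lo, hi) == deadline ? 0 : 1;
    }
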
diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
new file mode 100644
index 0000000000..4d8eb4c617
--- /dev/null
+++ b/xen/common/sched/core.c
@@ -0,0 +1,3144 @@
+/****************************************************************************
+ * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
+ * (C) 2002-2003 University of Cambridge
+ * (C) 2004 - Mark Williamson - Intel Research Cambridge
+ ****************************************************************************
+ *
+ * File: common/schedule.c
+ * Author: Rolf Neugebauer & Keir Fraser
+ * Updated for generic API by Mark Williamson
+ *
+ * Description: Generic CPU scheduling code
+ * implements support functionality for the Xen scheduler API.
+ *
+ */
+
+#ifndef COMPAT
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/timer.h>
+#include <xen/perfc.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+#include <xen/trace.h>
+#include <xen/mm.h>
+#include <xen/err.h>
+#include <xen/guest_access.h>
+#include <xen/hypercall.h>
+#include <xen/multicall.h>
+#include <xen/cpu.h>
+#include <xen/preempt.h>
+#include <xen/event.h>
+#include <public/sched.h>
+#include <xsm/xsm.h>
+#include <xen/err.h>
+
+#ifdef CONFIG_XEN_GUEST
+#include <asm/guest.h>
+#else
+#define pv_shim false
+#endif
+
+/* opt_sched: scheduler - default to configured value */
+static char __initdata opt_sched[10] = CONFIG_SCHED_DEFAULT;
+string_param("sched", opt_sched);
+
+/* if sched_smt_power_savings is set,
+ * the scheduler will give preference to a partially idle package over a
+ * fully idle one when picking a pCPU to schedule a vCPU.
+ */
+bool_t sched_smt_power_savings = 0;
+boolean_param("sched_smt_power_savings", sched_smt_power_savings);
+
+/* Default scheduling rate limit: 1ms
+ * The behavior when sched_ratelimit_us is greater than sched_credit_tslice_ms is undefined
+ * */
+int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
+integer_param("sched_ratelimit_us", sched_ratelimit_us);
+
+/* Number of vcpus per struct sched_unit. */
+bool __read_mostly sched_disable_smt_switching;
+cpumask_t sched_res_mask;
+
+/* Common lock for free cpus. */
+static DEFINE_SPINLOCK(sched_free_cpu_lock);
+
+/* Various timer handlers. */
+static void s_timer_fn(void *unused);
+static void vcpu_periodic_timer_fn(void *data);
+static void vcpu_singleshot_timer_fn(void *data);
+static void poll_timer_fn(void *data);
+
+/* This is global for now so that private implementations can reach it */
+DEFINE_PER_CPU_READ_MOSTLY(struct sched_resource *, sched_res);
+static DEFINE_PER_CPU_READ_MOSTLY(unsigned int, sched_res_idx);
+DEFINE_RCU_READ_LOCK(sched_res_rculock);
+
+/* Scratch space for cpumasks. */
+DEFINE_PER_CPU(cpumask_t, cpumask_scratch);
+
+/* How many urgent vcpus. */
+DEFINE_PER_CPU(atomic_t, sched_urgent_count);
+
+extern const struct scheduler *__start_schedulers_array[],
+                              *__end_schedulers_array[];
+#define NUM_SCHEDULERS (__end_schedulers_array - __start_schedulers_array)
+#define schedulers __start_schedulers_array
+
+static struct scheduler __read_mostly ops;
+
+static bool scheduler_active;
+
+static void sched_set_affinity(
+ struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft);
+
+static struct sched_resource *
+sched_idle_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
+{
+ return unit->res;
+}
+
+static void *
+sched_idle_alloc_udata(const struct scheduler *ops, struct sched_unit *unit,
+ void *dd)
+{
+ /* Any non-NULL pointer is fine here. */
+ return ZERO_BLOCK_PTR;
+}
+
+static void
+sched_idle_free_udata(const struct scheduler *ops, void *priv)
+{
+}
+
+static void sched_idle_schedule(
+ const struct scheduler *ops, struct sched_unit *unit, s_time_t now,
+ bool tasklet_work_scheduled)
+{
+ const unsigned int cpu = smp_processor_id();
+
+ unit->next_time = -1;
+ unit->next_task = sched_idle_unit(cpu);
+}
+
+static struct scheduler sched_idle_ops = {
+ .name = "Idle Scheduler",
+ .opt_name = "idle",
+ .sched_data = NULL,
+
+ .pick_resource = sched_idle_res_pick,
+ .do_schedule = sched_idle_schedule,
+
+ .alloc_udata = sched_idle_alloc_udata,
+ .free_udata = sched_idle_free_udata,
+};
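
sched_idle_ops above shows the shape of a Xen scheduler: a table of hooks the core dispatches through, here reduced to the bare minimum needed to keep idle units running. A standalone sketch of the same ops-table pattern, with toy types and names rather than the Xen API:

    /* Standalone sketch (toy types, not the Xen API): the ops-table pattern
     * used by sched_idle_ops -- the core only calls through these pointers. */
    #include <stdio.h>

    struct toy_sched_ops {
        const char *name;
        const char *opt_name;
        unsigned int (*pick_resource)(unsigned int unit);
        void (*do_schedule)(unsigned int unit);
    };

    static unsigned int toy_pick(unsigned int unit)
    {
        (void)unit;
        return 0;                       /* always resource 0 */
    }

    static void toy_schedule(unsigned int unit)
    {
        printf("run unit %u\n", unit);
    }

    static const struct toy_sched_ops toy_ops = {
        .name          = "Toy Scheduler",
        .opt_name      = "toy",
        .pick_resource = toy_pick,
        .do_schedule   = toy_schedule,
    };

    int main(void)
    {
        printf("%s picked resource %u\n", toy_ops.name, toy_ops.pick_resource(1));
        toy_ops.do_schedule(1);
        return 0;
    }
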
+
+static inline struct vcpu *unit2vcpu_cpu(const struct sched_unit *unit,
+ unsigned int cpu)
+{
+ unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu);
+ const struct domain *d = unit->domain;
+
+ return (idx < d->max_vcpus) ? d->vcpu[idx] : NULL;
+}
+
+static inline struct vcpu *sched_unit2vcpu_cpu(const struct sched_unit *unit,
+ unsigned int cpu)
+{
+ struct vcpu *v = unit2vcpu_cpu(unit, cpu);
+
+ return (v && v->new_state == RUNSTATE_running) ? v : idle_vcpu[cpu];
+}
+
+static inline struct scheduler *dom_scheduler(const struct domain *d)
+{
+ if ( likely(d->cpupool != NULL) )
+ return d->cpupool->sched;
+
+ /*
+ * If d->cpupool is NULL, this is the idle domain. This is special
+ * because the idle domain does not really belong to any cpupool, and,
+ * hence, does not really have a scheduler.
+ *
+ * This is (should be!) only called like this for allocating the idle
+ * vCPUs for the first time, during boot, in which case what we want
+ * is the default scheduler that was chosen at boot.
+ */
+ ASSERT(is_idle_domain(d));
+ return &ops;
+}
+
+static inline struct scheduler *unit_scheduler(const struct sched_unit *unit)
+{
+ struct domain *d = unit->domain;
+
+ if ( likely(d->cpupool != NULL) )
+ return d->cpupool->sched;
+
+ /*
+ * If d->cpupool is NULL, this is a unit of the idle domain. And this
+ * case is special because the idle domain does not really belong to
+ * a cpupool and, hence, doesn't really have a scheduler. In fact, its
+ * units (may) run on pCPUs which are in different pools, with different
+ * schedulers.
+ *
+ * What we want, in this case, is the scheduler of the pCPU where this
+ * particular idle unit is running. And, since unit->res never changes
+ * for idle units, it is safe to use it, with no locks, to figure that out.
+ */
+
+ ASSERT(is_idle_domain(d));
+ return unit->res->scheduler;
+}
+
+static inline struct scheduler *vcpu_scheduler(const struct vcpu *v)
+{
+ return unit_scheduler(v->sched_unit);
+}
+#define VCPU2ONLINE(_v) cpupool_domain_master_cpumask((_v)->domain)
+
+static inline void trace_runstate_change(struct vcpu *v, int new_state)
+{
+ struct { uint32_t vcpu:16, domain:16; } d;
+ uint32_t event;
+
+ if ( likely(!tb_init_done) )
+ return;
+
+ d.vcpu = v->vcpu_id;
+ d.domain = v->domain->domain_id;
+
+ event = TRC_SCHED_RUNSTATE_CHANGE;
+ event |= ( v->runstate.state & 0x3 ) << 8;
+ event |= ( new_state & 0x3 ) << 4;
+
+ __trace_var(event, 1/*tsc*/, sizeof(d), &d);
+}
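
trace_runstate_change() packs the old runstate into bits 8-9 of the event code and the new runstate into bits 4-5. A standalone sketch of that encoding and how a trace consumer could decode it (the base value below is a placeholder, not the real TRC_SCHED_RUNSTATE_CHANGE):

    /* Standalone sketch: the bit layout trace_runstate_change() uses.
     * TRC_BASE is a placeholder, not the real trace event constant. */
    #include <assert.h>
    #include <stdint.h>

    #define TRC_BASE 0x1000u

    static uint32_t encode_runstate_event(uint32_t old_state, uint32_t new_state)
    {
        return TRC_BASE | ((old_state & 0x3) << 8) | ((new_state & 0x3) << 4);
    }

    int main(void)
    {
        /* e.g. RUNSTATE_running (0) -> RUNSTATE_blocked (2) */
        uint32_t ev = encode_runstate_event(0, 2);

        assert(((ev >> 8) & 0x3) == 0);   /* old state */
        assert(((ev >> 4) & 0x3) == 2);   /* new state */
        return 0;
    }
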
+
+static inline void trace_continue_running(struct vcpu *v)
+{
+ struct { uint32_t vcpu:16, domain:16; } d;
+
+ if ( likely(!tb_init_done) )
+ return;
+
+ d.vcpu = v->vcpu_id;
+ d.domain = v->domain->domain_id;
+
+ __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d);
+}
+
+static inline void vcpu_urgent_count_update(struct vcpu *v)
+{
+ if ( is_idle_vcpu(v) )
+ return;
+
+ if ( unlikely(v->is_urgent) )
+ {
+ if ( !(v->pause_flags & VPF_blocked) ||
+ !test_bit(v->vcpu_id, v->domain->poll_mask) )
+ {
+ v->is_urgent = 0;
+ atomic_dec(&per_cpu(sched_urgent_count, v->processor));
+ }
+ }
+ else
+ {
+ if ( unlikely(v->pause_flags & VPF_blocked) &&
+ unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) )
+ {
+ v->is_urgent = 1;
+ atomic_inc(&per_cpu(sched_urgent_count, v->processor));
+ }
+ }
+}
+
+static inline void vcpu_runstate_change(
+ struct vcpu *v, int new_state, s_time_t new_entry_time)
+{
+ s_time_t delta;
+ struct sched_unit *unit = v->sched_unit;
+
+ ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
+ if ( v->runstate.state == new_state )
+ return;
+
+ vcpu_urgent_count_update(v);
+
+ trace_runstate_change(v, new_state);
+
+ if ( !is_idle_vcpu(v) )
+ {
+ unit->runstate_cnt[v->runstate.state]--;
+ unit->runstate_cnt[new_state]++;
+ }
+
+ delta = new_entry_time - v->runstate.state_entry_time;
+ if ( delta > 0 )
+ {
+ v->runstate.time[v->runstate.state] += delta;
+ v->runstate.state_entry_time = new_entry_time;
+ }
+
+ v->runstate.state = new_state;
+}
+
+void sched_guest_idle(void (*idle) (void), unsigned int cpu)
+{
+ /*
+ * Another vcpu of the unit is active in guest context while this one is
+ * idle. In case of a scheduling event we don't want to have high latencies
+ * due to a cpu needing to wake up from deep C state for joining the
+ * rendezvous, so avoid those deep C states by incrementing the urgent
+ * count of the cpu.
+ */
+ atomic_inc(&per_cpu(sched_urgent_count, cpu));
+ idle();
+ atomic_dec(&per_cpu(sched_urgent_count, cpu));
+}
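
sched_guest_idle() marks the CPU urgent for the duration of the idle call so that, while a sibling vCPU of the unit is still in guest context, power management does not drop into deep C-states that would make the next scheduling rendezvous expensive. A standalone sketch of that pattern (C11 atomics, illustrative names, not Xen code):

    /* Standalone sketch of the urgent-count pattern above: mark the CPU
     * urgent around the idle call so a cpuidle-style policy elsewhere can
     * avoid deep sleep states in the meantime. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int urgent_count;   /* stands in for per_cpu(sched_urgent_count) */

    static bool deep_sleep_allowed(void)
    {
        return atomic_load(&urgent_count) == 0;
    }

    static void noop_idle(void) { }

    static void guest_idle(void (*idle)(void))
    {
        atomic_fetch_add(&urgent_count, 1);
        printf("deep sleep allowed while idling: %d\n", deep_sleep_allowed());
        idle();                       /* only shallow idle states here */
        atomic_fetch_sub(&urgent_count, 1);
    }

    int main(void)
    {
        guest_idle(noop_idle);
        printf("deep sleep allowed afterwards: %d\n", deep_sleep_allowed());
        return 0;
    }
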
+
+void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
+{
+ spinlock_t *lock;
+ s_time_t delta;
+
+ rcu_read_lock(&sched_res_rculock);
+
+ lock = likely(v == current) ? NULL : unit_schedule_lock_irq(v->sched_unit);
+ memcpy(runstate, &v->runstate, sizeof(*runstate));
+ delta = NOW() - runstate->state_entry_time;
+ if ( delta > 0 )
+ runstate->time[runstate->state] += delta;
+
+ if ( unlikely(lock != NULL) )
+ unit_schedule_unlock_irq(lock, v->sched_unit);
+
+ rcu_read_unlock(&sched_res_rculock);
+}
+
+uint64_t get_cpu_idle_time(unsigned int cpu)
+{
+ struct vcpu_runstate_info state = { 0 };
+ struct vcpu *v = idle_vcpu[cpu];
+
+ if ( cpu_online(cpu) && v )
+ vcpu_runstate_get(v, &state);
+
+ return state.time[RUNSTATE_running];
+}
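
get_cpu_idle_time() reports the accumulated running time of a CPU's idle vCPU in nanoseconds; a consumer typically samples it twice and compares the delta against wall-clock time. A standalone sketch with made-up numbers:

    /* Standalone sketch: turning two samples of a CPU's idle time
     * (nanoseconds, as returned by get_cpu_idle_time()) into a busy
     * percentage over the sampling interval. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t idle_prev = 4200000000ull, now_prev = 10000000000ull;
        uint64_t idle_cur  = 4950000000ull, now_cur  = 11000000000ull;

        uint64_t idle_delta = idle_cur - idle_prev;   /* 750 ms idle ... */
        uint64_t wall_delta = now_cur - now_prev;     /* ... in a 1000 ms window */

        printf("cpu busy: %llu%%\n",
               (unsigned long long)(100 - (100 * idle_delta) / wall_delta));
        return 0;
    }
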
+
+/*
+ * If locks are different, take the one with the lower address first.
+ * This avoids dead- or live-locks when this code is running on both
+ * cpus at the same time.
+ */
+static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2,
+ unsigned long *flags)
+{
+ if ( lock1 == lock2 )
+ {
+ spin_lock_irqsave(lock1, *flags);
+ }
+ else if ( lock1 < lock2 )
+ {
+ spin_lock_irqsave(lock1, *flags);
+ spin_lock(lock2);
+ }
+ else
+ {
+ spin_lock_irqsave(lock2, *flags);
+ spin_lock(lock1);
+ }
+}
+
+static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2,
+ unsigned long flags)
+{
+ if ( lock1 != lock2 )
+ spin_unlock(lock2);
+ spin_unlock_irqrestore(lock1, flags);
+}
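
The helper pair above orders lock acquisition by address so that two CPUs taking the same pair of runqueue locks always agree on the order and cannot deadlock against each other (the classic ABBA case). The same rule as a standalone pthreads sketch (not Xen code):

    /* Standalone sketch (pthreads): the lock-ordering rule of
     * sched_spin_lock_double() -- always take the lower address first. */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

    static void lock_double(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        if ( a == b )
            pthread_mutex_lock(a);
        else if ( a < b )
        {
            pthread_mutex_lock(a);
            pthread_mutex_lock(b);
        }
        else
        {
            pthread_mutex_lock(b);
            pthread_mutex_lock(a);
        }
    }

    static void unlock_double(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        if ( a != b )
            pthread_mutex_unlock(b);
        pthread_mutex_unlock(a);
    }

    int main(void)
    {
        /* Both call sites end up taking the locks in the same (address)
         * order, no matter which argument order they were given. */
        lock_double(&m1, &m2);
        unlock_double(&m1, &m2);
        lock_double(&m2, &m1);
        unlock_double(&m2, &m1);
        puts("ok");
        return 0;
    }
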
+
+static void sched_free_unit_mem(struct sched_unit *unit)
+{
+ struct sched_unit *prev_unit;
+ struct domain *d = unit->domain;
+
+ if ( d->sched_unit_list == unit )
+ d->sched_unit_list = unit->next_in_list;
+ else
+ {
+ for_each_sched_unit ( d, prev_unit )
+ {
+ if ( prev_unit->next_in_list == unit )
+ {
+ prev_unit->next_in_list = unit->next_in_list;
+ break;
+ }
+ }
+ }
+
+ free_cpumask_var(unit->cpu_hard_affinity);
+ free_cpumask_var(unit->cpu_hard_affinity_saved);
+ free_cpumask_var(unit->cpu_soft_affinity);
+
+ xfree(unit);
+}
+
+static void sched_free_unit(struct sched_unit *unit, struct vcpu *v)
+{
+ struct vcpu *vunit;
+ unsigned int cnt = 0;
+
+ /* Don't count the vcpu being released; it might not be in the vcpu list yet. */
+ for_each_sched_unit_vcpu ( unit, vunit )
+ if ( vunit != v )
+ cnt++;
+
+ v->sched_unit = NULL;
+ unit->runstate_cnt[v->runstate.state]--;
+
+ if ( unit->vcpu_list == v )
+ unit->vcpu_list = v->next_in_list;
+
+ if ( !cnt )
+ sched_free_unit_mem(unit);
+}
+
+static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v)
+{
+ v->sched_unit = unit;
+
+ /* All but idle vcpus are allocated with sequential vcpu_id. */
+ if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id )
+ {
+ unit->vcpu_list = v;
+ /*
+ * unit_id is always the same as the lowest vcpu_id of the unit.
+ * This is used to stop the for_each_sched_unit_vcpu() loop and to
+ * support cpupools with different granularities.
+ */
+ unit->unit_id = v->vcpu_id;
+ }
+ unit->runstate_cnt[v->runstate.state]++;
+}
+
+static struct sched_unit *sched_alloc_unit_mem(void)
+{
+ struct sched_unit *unit;
+
+ unit = xzalloc(struct sched_unit);
+ if ( !unit )
+ return NULL;
+
+ if ( !zalloc_cpumask_var(&unit->cpu_hard_affinity) ||
+ !zalloc_cpumask_var(&unit->cpu_hard_affinity_saved) ||
+ !zalloc_cpumask_var(&unit->cpu_soft_affinity) )
+ {
+ sched_free_unit_mem(unit);
+ unit = NULL;
+ }
+
+ return unit;
+}
+
+static void sched_domain_insert_unit(struct sched_unit *unit, struct domain *d)
+{
+ struct sched_unit **prev_unit;
+
+ unit->domain = d;
+
+ for ( prev_unit = &d->sched_unit_list; *prev_unit;
+ prev_unit = &(*prev_unit)->next_in_list )
+ if ( (*prev_unit)->next_in_list &&
+ (*prev_unit)->next_in_list->unit_id > unit->unit_id )
+ break;
+
+ unit->next_in_list = *prev_unit;
+ *prev_unit = unit;
+}
+
+static struct sched_unit *sched_alloc_unit(struct vcpu *v)
+{
+ struct sched_unit *unit;
+ struct domain *d = v->domain;
+ unsigned int gran = cpupool_get_granularity(d->cpupool);
+
+ for_each_sched_unit ( d, unit )
+ if ( unit->unit_id / gran == v->vcpu_id / gran )
+ break;
+
+ if ( unit )
+ {
+ sched_unit_add_vcpu(unit, v);
+ return unit;
+ }
+
+ if ( (unit = sched_alloc_unit_mem()) == NULL )
+ return NULL;
+
+ sched_unit_add_vcpu(unit, v);
+ sched_domain_insert_unit(unit, d);
+
+ return unit;
+}
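
sched_alloc_unit() groups vCPUs into units of gran consecutive vcpu_ids, and sched_unit_add_vcpu() sets unit_id to the lowest vcpu_id in the group. A standalone illustration of that mapping:

    /* Standalone illustration: with scheduling granularity 'gran', vCPUs
     * whose IDs fall into the same gran-sized block share one sched_unit,
     * and unit_id is the lowest vcpu_id of that block (cf. the
     * unit->unit_id / gran == v->vcpu_id / gran test above). */
    #include <stdio.h>

    int main(void)
    {
        unsigned int gran = 2;   /* e.g. core scheduling on 2-thread SMT */

        for ( unsigned int vcpu_id = 0; vcpu_id < 6; vcpu_id++ )
            printf("vcpu %u -> unit %u\n", vcpu_id, (vcpu_id / gran) * gran);
        /* vcpus 0,1 -> unit 0; 2,3 -> unit 2; 4,5 -> unit 4 */
        return 0;
    }
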
+
+static unsigned int sched_select_initial_cpu(const struct vcpu *v)
+{
+ const struct domain *d = v->domain;
+ nodeid_t node;
+ spinlock_t *lock;
+ unsigned long flags;
+ unsigned int cpu_ret, cpu = smp_processor_id();
+ cpumask_t *cpus = cpumask_scratch_cpu(cpu);
+
+ lock = pcpu_schedule_lock_irqsave(cpu, &flags);
+ cpumask_clear(cpus);
+ for_each_node_mask ( node, d->node_affinity )
+ cpumask_or(cpus, cpus, &node_to_cpumask(node));
+ cpumask_and(cpus, cpus, d->cpupool->cpu_valid);
+ if ( cpumask_empty(cpus) )
+ cpumask_copy(cpus, d->cpupool->cpu_valid);
+
+ if ( v->vcpu_id == 0 )
+ cpu_ret = cpumask_first(cpus);
+ else
+ {
+ /* We can rely on previous vcpu being available. */
+ ASSERT(!is_idle_domain(d));
+
+ cpu_ret = cpumask_cycle(d->vcpu[v->vcpu_id - 1]->processor, cpus);
+ }
+
+ pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
+
+ return cpu_ret;
+}
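
sched_select_initial_cpu() narrows the candidates to the CPUs of the domain's NUMA node affinity within the cpupool (falling back to the whole pool), gives vCPU 0 the first candidate, and lets each later vCPU cycle onward from its predecessor's processor. A standalone sketch of the cycling behaviour (a plain bitmask stands in for a cpumask):

    /* Standalone sketch of the cpumask_cycle() behaviour relied on above:
     * return the next set bit after 'prev', wrapping around, so successive
     * vCPUs of a domain get spread across the eligible CPUs. */
    #include <stdio.h>

    static unsigned int cycle(unsigned int prev, unsigned long mask,
                              unsigned int nbits)
    {
        for ( unsigned int i = 1; i <= nbits; i++ )
        {
            unsigned int cpu = (prev + i) % nbits;

            if ( mask & (1UL << cpu) )
                return cpu;
        }
        return nbits;                    /* empty mask */
    }

    int main(void)
    {
        unsigned long eligible = 0x2d;   /* CPUs 0, 2, 3 and 5 */
        unsigned int cpu = 0;

        for ( unsigned int i = 0; i < 5; i++ )
        {
            cpu = cycle(cpu, eligible, 8);
            printf("%u ", cpu);          /* prints: 2 3 5 0 2 */
        }
        printf("\n");
        return 0;
    }
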
+
+int sched_init_vcpu(struct vcpu *v)
+{
+ struct domain *d = v->domain;
+ struct sched_unit *unit;
+ unsigned int processor;
+
+ if ( (unit = sched_alloc_unit(v)) == NULL )
+ return 1;
+
+ if ( is_idle_domain(d) )
+ processor = v->vcpu_id;
+ else
+ processor = sched_select_initial_cpu(v);
+
+ /* Initialise the per-vcpu timers. */
+ spin_lock_init(&v->periodic_timer_lock);
+ init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, v, processor);
+ init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn, v, processor);
+ init_timer(&v->poll_timer, poll_timer_fn, v, processor);
+
+ /* If this is not the first vcpu of the unit, we are done. */
+ if ( unit->priv != NULL )
+ {
+ v->processor = processor;
+ return 0;
+ }
+
+ rcu_read_lock(&sched_res_rculock);
+
+ /* The first vcpu of a unit can be set via sched_set_res(). */
+ sched_set_res(unit, get_sched_res(processor));
+
+ unit->priv = sched_alloc_udata(dom_scheduler(d), unit, d->sched_priv);
+ if ( unit->priv == NULL )
+ {
+ sched_free_unit(unit, v);
+ rcu_read_unlock(&sched_res_rculock);
+ return 1;
+ }
+
+ /*
+ * Initialize affinity settings. The idler, and potentially
+ * domain-0 VCPUs, are pinned onto their respective physical CPUs.
+ */
+ if ( is_idle_domain(d) || (is_hardware_domain(d) && opt_dom0_vcpus_pin) )
+ sched_set_affinity(unit, cpumask_of(processor), &cpumask_all);
+ else
+ sched_set_affinity(unit, &cpumask_all, &cpumask_all);
+
+ /* Idle VCPUs are scheduled immediately, so don't put them in runqueue. */
+ if ( is_idle_domain(d) )
+ {
+ get_sched_res(v->processor)->curr = unit;
+ get_sched_res(v->processor)->sched_unit_idle = unit;
+ v->is_running = 1;
+ unit->is_running = true;
+ unit->state_entry_time = NOW();
+ }
+ else
+ {
+ sched_insert_unit(dom_scheduler(d), unit);
+ }
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ return 0;
+}
+
+static void vcpu_move_irqs(struct vcpu *v)
+{
+ arch_move_irqs(v);
+ evtchn_move_pirqs(v);
+}
+
+static void sched_move_irqs(const struct sched_unit *unit)
+{
+ struct vcpu *v;
+
+ for_each_sched_unit_vcpu ( unit, v )
+ vcpu_move_irqs(v);
+}
+
+int sched_move_domain(struct domain *d, struct cpupool *c)
+{
+ struct vcpu *v;
+ struct sched_unit *unit;
+ unsigned int new_p, unit_idx;
+ void **unit_priv;
+ void *domdata;
+ void *unitdata;
+ struct scheduler *old_ops;
+ void *old_domdata;
+ unsigned int gran = cpupool_get_granularity(c);
+ int ret = 0;
+
+ for_each_vcpu ( d, v )
+ {
+ if ( v->affinity_broken )
+ return -EBUSY;
+ }
+
+ rcu_read_lock(&sched_res_rculock);
+
+ domdata = sched_alloc_domdata(c->sched, d);
+ if ( IS_ERR(domdata) )
+ {
+ ret = PTR_ERR(domdata);
+ goto out;
+ }
+
+ unit_priv = xzalloc_array(void *, DIV_ROUND_UP(d->max_vcpus, gran));
+ if ( unit_priv == NULL )
+ {
+ sched_free_domdata(c->sched, domdata);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ unit_idx = 0;
+ for_each_sched_unit ( d, unit )
+ {
+ unit_priv[unit_idx] = sched_alloc_udata(c->sched, unit, domdata);
+ if ( unit_priv[unit_idx] == NULL )
+ {
+ for ( unit_idx = 0; unit_priv[unit_idx]; unit_idx++ )
+ sched_free_udata(c->sched, unit_priv[unit_idx]);
+ xfree(unit_priv);
+ sched_free_domdata(c->sched, domdata);
+ ret = -ENOMEM;
+ goto out;
+ }
+ unit_idx++;
+ }
+
+ domain_pause(d);
+
+ old_ops = dom_scheduler(d);
+ old_domdata = d->sched_priv;
+
+ for_each_sched_unit ( d, unit )
+ {
+ sched_remove_unit(old_ops, unit);
+ }
+
+ d->cpupool = c;
+ d->sched_priv = domdata;
+
+ new_p = cpumask_first(c->cpu_valid);
+ unit_idx = 0;
+ for_each_sched_unit ( d, unit )
+ {
+ spinlock_t *lock;
+ unsigned int unit_p = new_p;
+
+ unitdata = unit->priv;
+
+ for_each_sched_unit_vcpu ( unit, v )
+ {
+ migrate_timer(&v->periodic_timer, new_p);
+ migrate_timer(&v->singleshot_timer, new_p);
+ migrate_timer(&v->poll_timer, new_p);
+ new_p = cpumask_cycle(new_p, c->cpu_valid);
+ }
+
+ lock = unit_schedule_lock_irq(unit);
+
+ sched_set_affinity(unit, &cpumask_all, &cpumask_all);
+
+ sched_set_res(unit, get_sched_res(unit_p));
+ /*
+ * With v->processor modified we must not
+ * - make any further changes assuming we hold the scheduler lock,
+ * - use unit_schedule_unlock_irq().
+ */
+ spin_unlock_irq(lock);
+
+ unit->priv = unit_priv[unit_idx];
+ if ( !d->is_dying )
+ sched_move_irqs(unit);
+
+ sched_insert_unit(c->sched, unit);
+
+ sched_free_udata(old_ops, unitdata);
+
+ unit_idx++;
+ }
+
+ domain_update_node_affinity(d);
+
+ domain_unpause(d);
+
+ sched_free_domdata(old_ops, old_domdata);
+
+ xfree(unit_priv);
+
+out:
+ rcu_read_unlock(&sched_res_rculock);
+
+ return ret;
+}
+
+void sched_destroy_vcpu(struct vcpu *v)
+{
+ struct sched_unit *unit = v->sched_unit;
+
+ kill_timer(&v->periodic_timer);
+ kill_timer(&v->singleshot_timer);
+ kill_timer(&v->poll_timer);
+ if ( test_and_clear_bool(v->is_urgent) )
+ atomic_dec(&per_cpu(sched_urgent_count, v->processor));
+ /*
+ * Vcpus are being destroyed top-down. So being the first vcpu of a unit
+ * is the same as being the only one.
+ */
+ if ( unit->vcpu_list == v )
+ {
+ rcu_read_lock(&sched_res_rculock);
+
+ sched_remove_unit(vcpu_scheduler(v), unit);
+ sched_free_udata(vcpu_scheduler(v), unit->priv);
+ sched_free_unit(unit, v);
+
+ rcu_read_unlock(&sched_res_rculock);
+ }
+}
+
+int sched_init_domain(struct domain *d, int poolid)
+{
+ void *sdom;
+ int ret;
+
+ ASSERT(d->cpupool == NULL);
+ ASSERT(d->domain_id < DOMID_FIRST_RESERVED);
+
+ if ( (ret = cpupool_add_domain(d, poolid)) )
+ return ret;
+
+ SCHED_STAT_CRANK(dom_init);
+ TRACE_1D(TRC_SCHED_DOM_ADD, d->domain_id);
+
+ rcu_read_lock(&sched_res_rculock);
+
+ sdom = sched_alloc_domdata(dom_scheduler(d), d);
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ if ( IS_ERR(sdom) )
+ return PTR_ERR(sdom);
+
+ d->sched_priv = sdom;
+
+ return 0;
+}
+
+void sched_destroy_domain(struct domain *d)
+{
+ ASSERT(d->domain_id < DOMID_FIRST_RESERVED);
+
+ if ( d->cpupool )
+ {
+ SCHED_STAT_CRANK(dom_destroy);
+ TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id);
+
+ rcu_read_lock(&sched_res_rculock);
+
+ sched_free_domdata(dom_scheduler(d), d->sched_priv);
+ d->sched_priv = NULL;
+
+ rcu_read_unlock(&sched_res_rculock);
+
+ cpupool_rm_domain(d);
+ }
+}
+
+static void vcpu_sleep_nosync_locked(struct vcpu *v)
+{
+ struct sched_unit *unit = v->sched_unit;
+
+ ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
+
+ if ( likely(!vcpu_runnable(v)) )
+ {
+ if ( v->runstate.state == RUNSTATE_runnable )
+ vcpu_runstate_change(v, RUNSTATE_offline, NOW());
+
+ /* Only put the unit to sleep if none of its vcpus is runnable. */
+ if ( likely(!unit_runnable(unit)) )
+ sched_sleep(unit_scheduler(unit), unit);
+ else if ( unit_running(unit) > 1 && v->is_running &&
+ !v->force_context_switch )
+ {
+ v->force_context_switch = true;
+ cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ);
+ }
+ }
+}
+
+void vcpu_sleep_nosync(struct vcpu *v)
+{
+ unsigned long flags;
+ spinlock_t *lock;
+
+ TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
+
+ rcu_read_lock(&sched_res_rculock);
+
+ lock = unit_schedule_lock_irqsave(v->sched_unit, &flags);
+
+ vcpu_sleep_nosync_locked(v);
+
+ unit_schedule_unlock_irqrestore(lock, flags, v->sched_unit);
+
+ rcu_read_unlock(&sched_res_rculock);
+}
+
+void vcpu_sleep_sync(struct vcpu *v)
+{
+ vcpu_sleep_nosync(v);
+
+ while ( !vcpu_runnable(v) && v->is_running )
+ cpu_relax();
+
+ sync_vcpu_execstate(v);
+}
+
+void vcpu_wake(struct vcpu *v)
+{
+ unsigned long flags;
+ spinlock_t *lock;
+ struct sched_unit *unit = v->sched_unit;
+
+ TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
+
+ rcu_read_lock(&sched_res_rculock);
+
+ lock = unit_schedule_lock_irqsave(unit, &flags);
+
+ if ( likely(vcpu_runnable(v)) )
+ {
+ if ( v->runstate.state >= RUNSTATE_blocked )
+ vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
+ /*
+ * Call sched_wake() unconditionally, even if unit is running already.
+ * We might have not been de-scheduled after vcpu_sleep_nosync_locked()
+ * and are now to be woken up again.
+ */
+ sched_wake(unit_scheduler(unit), unit);
+ if ( unit->is_running && !v->is_running && !v->force_context_switch )
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/xen-changelog