
[Xen-devel] Problem with simple scheduler



Hi all,

I've spent the last few weeks trying to debug a weird issue with a new scheduler I'm developing for Xen. I've written a barebones round-robin scheduler that works fine while Dom0 starts up, but at some point during boot everything just hangs (fairly deterministically, as far as I can tell from a week of debugging; see below).

I've inlined my source code below. I don't expect anyone to read the whole thing (although it's quite minimal) so here are the key points:
  • I've implemented the following callbacks: init_domain, destroy_domain, insert_vcpu, remove_vcpu, sleep, wake, yield, pick_cpu, do_schedule, init, deinit, alloc_vdata, free_vdata, alloc_pdata, free_pdata, alloc_domdata, free_domdata. Most of these are minimal (or in some cases do nothing). Am I missing anything critical?
  • The hang occurs even if I run Dom0 with just a single vcpu, and nothing hangs if I choose a stock scheduler. So either I'm doing something foolish that causes a deadlock (less likely, since the code structure is borrowed from sched_credit.c), or I'm *not* doing something I should be, leading to Dom0 crashing and the vcpu simply dying.
If you do suspect some specific issue please let me know. Below are some of the possible issues that I've investigated but hit dead ends on:
  • Checking whether my debug printk statements were causing a deadlock by sleeping in interrupt context. This doesn't seem to be the case, since Dom0 hangs during boot even if I compile out all debug output (my debug() macro is sketched after this list).
  • I suspected incorrect queuing operations that might be corrupting memory somewhere. However, my debug logs tell me that this is not the case. There is at most one element in the runqueue at all times (I use Dom0 with 1 vcpu).
  • I also suspected a deadlock due to incorrect locking. However, based on what the credit scheduler does in sched_credit.c, I don't seem to be doing anything significantly different. In general though, which callbacks run in interrupt context?
  • In the end, I stuck debug statements in tick_suspend and tick_resume (roughly the instrumentation shown after this list), and after the hang those get called over and over, which looks like the physical CPU has gone idle. Is that interpretation correct? If so, *what am I doing wrong in the scheduler* to cause Dom0 to crash?
  • The hang occurs around 3-5 seconds into the boot process, quite deterministically. Could it be some periodic timer going off and interacting with my code in weird and wonderful ways?
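
For reference, here is roughly what my debugging setup looks like. This is only a sketch: debug() is my own macro (guarded by the same RTS_CONFIG_DEBUG as in the source below), and the tick instrumentation is just a printk dropped into the sched_tick_suspend() wrapper in xen/common/schedule.c (sched_tick_resume() gets the same treatment):

/* My debug() macro: compiles away entirely when RTS_CONFIG_DEBUG is off,
 * which is how I ruled out the printk theory above. */
#ifdef RTS_CONFIG_DEBUG
#define debug(fmt, args...)    printk(fmt, ## args)
#else
#define debug(fmt, args...)    do { } while (0)
#endif

/* xen/common/schedule.c; the marked line is mine, the rest is stock. */
void sched_tick_suspend(void)
{
    struct scheduler *sched;
    unsigned int cpu = smp_processor_id();

    sched = per_cpu(scheduler, cpu);
    printk("tick_suspend: cpu=%u\n", cpu);    /* added for debugging */
    SCHED_OP(sched, tick_suspend, cpu);
}
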
Also, how do the sleep/wake/yield callbacks work? When do they get called? Is there any documentation on the different callbacks with regard to when they are called? If I understand everything correctly after this, I would gladly create a wiki page explaining it (and perhaps a tutorial on writing a simple scheduler; something I wish existed!).
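
For what it's worth, my current working assumption from reading schedule.c and sched_credit.c is: vcpu_sleep_nosync() invokes the sleep hook when a vCPU blocks or gets paused, vcpu_wake() invokes the wake hook when an event makes it runnable again, and the yield hook fires when a guest issues SCHEDOP_yield. Is that roughly right?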

I hope this description is enough to understand the problem. If not, feel free to ask for more details :-)

Thanks for reading this far! The source code follows.

--
/mvanga


---------- SOURCE CODE BEGINS ----------
/****************************************************************************
 * (C) 2013 - Manohar Vanga - MPI-SWS
 ****************************************************************************
 *
 *        File: common/sched_xfair.c
 *      Author: Manohar Vanga
 *
 * Description: Table driven scheduler for Xen
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <asm/atomic.h>
#include <asm/div64.h>
#include <xen/errno.h>
#include <xen/keyhandler.h>
#include <xen/trace.h>
#include <xen/list.h>

/* Default timeslice: 30ms */
#define XFAIR_DEFAULT_TSLICE_MS    30

/* Some useful macros */
/* Get the private data from a set of ops */
#define XFAIR_PRIV(_ops)    \
    ((struct xfair_private *)((_ops)->sched_data))
/* Get the PCPU structure for a given CPU number */
#define XFAIR_PCPU(_c)      \
    ((struct xfair_pcpu *)per_cpu(schedule_data, _c).sched_priv)
/* Get the XFair VCPU structure for a given Xen VCPU */
#define XFAIR_VCPU(_vcpu)   ((struct xfair_vcpu *)(_vcpu)->sched_priv)
/* Get the XFair dom structure for a given Xen dom */
#define XFAIR_DOM(_dom)     ((struct xfair_dom *)(_dom)->sched_priv)
/* Get the runqueue for a particular CPU */
#define RUNQ(_cpu)          (&(XFAIR_PCPU(_cpu)->runq))
/* Is the first element of _cpu's runq its idle vcpu? */
#define IS_RUNQ_IDLE(_cpu)  (list_empty(RUNQ(_cpu)) || \
                             is_idle_vcpu(__runq_elem(RUNQ(_cpu)->next)->vcpu))


/* Xfair tracing events */
#define TRC_XFAIR_SCHED_START   TRC_SCHED_CLASS_EVT(XFAIR, 1)
#define TRC_XFAIR_SCHED_END     TRC_SCHED_CLASS_EVT(XFAIR, 2)

/* Physical CPU */
struct xfair_pcpu {
  struct list_head runq;
#if 0
  struct timer ticker;
  unsigned int tick;
#endif
};

/* Virtual CPU */
struct xfair_vcpu {
  struct xfair_dom *domain; /* The domain this VCPU belongs to */
  struct vcpu *vcpu; /* The core Xen VCPU structure */
  struct list_head runq_elem; /* List element for adding to runqueue */
};

/* Domain */
struct xfair_dom {
  struct domain *dom; /* The core Xen domain structure */
};

/* System-wide private data */
struct xfair_private {
  spinlock_t lock;
};

static inline int __vcpu_on_runq(struct xfair_vcpu *vcpu)
{
  return !list_empty(&vcpu->runq_elem);
}

static inline struct xfair_vcpu *__runq_elem(struct list_head *elem)
{
  return list_entry(elem, struct xfair_vcpu, runq_elem);
}

static inline void __runq_insert(unsigned int cpu, struct xfair_vcpu *vcpu)
{
    struct list_head *runq = RUNQ(cpu);

    BUG_ON(__vcpu_on_runq(vcpu));
    BUG_ON(cpu != vcpu->vcpu->processor);

    /* Add back at the end of the list */
    list_add_tail(&vcpu->runq_elem, runq);
}

static inline void __runq_remove(struct xfair_vcpu *vcpu)
{
    BUG_ON(!__vcpu_on_runq(vcpu));
    list_del_init(&vcpu->runq_elem);
}

static inline void print_runq(unsigned int cpu)
{
    struct xfair_vcpu *c;
    struct list_head *runq = RUNQ(cpu);

    debug("RUNQ: ");
    list_for_each_entry(c, runq, runq_elem)
        debug("(%d.%d) ", c->domain->dom->domain_id, c->vcpu->vcpu_id);
    debug("\n");
}

/* Allocate a structure for a physical CPU */
static void *xfair_alloc_pdata(const struct scheduler *ops, int cpu)
{
  struct xfair_pcpu *pcpu;

  debug(KERN_INFO "%s: ", __func__);
  debug("cpu=%d\n", cpu);

  /* Allocate per-PCPU info */
  pcpu = xzalloc(struct xfair_pcpu);
  if (pcpu == NULL)
    return NULL;

  INIT_LIST_HEAD(&pcpu->runq);
    /* schedule.c expects this to not be NULL (for some reason) */
  if (per_cpu(schedule_data, cpu).sched_priv == NULL)
    per_cpu(schedule_data, cpu).sched_priv = pcpu;

  BUG_ON(!is_idle_vcpu(curr_on_cpu(cpu)));

  return pcpu;
}

static void xfair_free_pdata(const struct scheduler *ops, void *pc, int cpu)
{
  struct xfair_pcpu *pcpu = pc;

  debug(KERN_INFO "%s: ", __func__);
  debug("cpu=%d\n", cpu);

    if (pcpu)
        xfree(pcpu);
}

static void *xfair_alloc_vdata(const struct scheduler *ops, struct vcpu *vc,
  void *dd)
{
  struct xfair_vcpu *vcpu;

  /* Allocate per-VCPU info */
  vcpu = xzalloc(struct xfair_vcpu);
  if (vcpu == NULL)
    return NULL;

  INIT_LIST_HEAD(&vcpu->runq_elem);
  vcpu->domain = dd;
  vcpu->vcpu = vc;

  debug(KERN_INFO "%s: ", __func__);
  debug("vcpu=%d\n", vc->vcpu_id);

  return vcpu;
}

static void xfair_free_vdata(const struct scheduler *ops, void *vc)
{
  struct xfair_vcpu *vcpu = vc;

    if (!vcpu)
        return;

    debug(KERN_INFO "%s: ", __func__);
    debug("vcpu=%d\n", vcpu->vcpu->vcpu_id);

  BUG_ON(!list_empty(&vcpu->runq_elem));
  xfree(vcpu);
}

static void xfair_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
{
    struct xfair_vcpu *vcpu = XFAIR_VCPU(vc);

    BUG_ON(!vcpu);

    debug(KERN_INFO "%s: ", __func__);
    debug("vcpu=%d\n", vcpu->vcpu->vcpu_id);

    if (!vc->is_running && vcpu_runnable(vc) && !__vcpu_on_runq(vcpu))
        __runq_insert(vc->processor, vcpu);
}

static void xfair_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
{
    struct xfair_vcpu * const vcpu = XFAIR_VCPU(vc);
    struct xfair_dom *dom;

    BUG_ON(!vcpu); /* check before dereferencing vcpu, not after */
    dom = vcpu->domain;

    debug(KERN_INFO "%s: ", __func__);
    debug("vcpu=%d\n", vcpu->vcpu->vcpu_id);

    if (__vcpu_on_runq(vcpu))
        __runq_remove(vcpu);

    BUG_ON(dom == NULL);
    BUG_ON(!list_empty(&vcpu->runq_elem));
}

static void xfair_sleep(const struct scheduler *ops, struct vcpu *vc)
{
    struct xfair_vcpu * const vcpu = XFAIR_VCPU(vc);

    debug(KERN_INFO "%s: ", __func__);
    debug("dom=%d, vcpu=%d\n", vcpu->domain->dom->domain_id, vcpu->vcpu->vcpu_id);

    BUG_ON(is_idle_vcpu(vc));
    BUG_ON(vcpu_runnable(vc));

    /*
     * If the vcpu is the current VCPU on the processor, it is guaranteed
     * not to be on that processor's runqueue, so just ask for a reschedule.
     * If instead it was waiting in the runqueue, it must be removed.
     */
    if (curr_on_cpu(vc->processor) == vc)
        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
    else if (__vcpu_on_runq(vcpu))
        __runq_remove(vcpu);
}

static void xfair_wake(const struct scheduler *ops, struct vcpu *vc)
{
    struct xfair_vcpu * const vcpu = XFAIR_VCPU(vc);

    debug(KERN_INFO "%s: ", __func__);
    debug("dom=%d, vcpu=%d\n", vcpu->domain->dom->domain_id, vcpu->vcpu->vcpu_id);

    BUG_ON(is_idle_vcpu(vc));

    if (unlikely(curr_on_cpu(vc->processor) == vc)) {
        debug("woke vcpu=%d that is currently running on cpu=%d\n",
              vc->vcpu_id, vc->processor);
        return;
    }

    if (unlikely(__vcpu_on_runq(vcpu))) {
        debug("vcpu=%d is already on runqueue of cpu=%d\n",
              vc->vcpu_id, vc->processor);
        return;
    }

    /* Not on the runqueue (checked above), so queue it if runnable */
    if (vcpu_runnable(vc) && !vc->is_running)
        __runq_insert(vc->processor, vcpu);

    cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
}

static void xfair_yield(const struct scheduler *ops, struct vcpu *vc)
{
#ifdef RTS_CONFIG_DEBUG
    struct xfair_vcpu * const vcpu = XFAIR_VCPU(vc);
#endif

    debug(KERN_INFO "%s: ", __func__);
    debug("dom=%d, vcpu=%d\n", vcpu->domain->dom->domain_id, vcpu->vcpu->vcpu_id);
}

static void *xfair_alloc_domdata(const struct scheduler *ops, struct domain *d)
{
  struct xfair_dom *dom;

  debug(KERN_INFO "%s: ", __func__);
  debug("dom=%d\n", d->domain_id);

  dom = xzalloc(struct xfair_dom);
  if (dom == NULL)
    return NULL;

  dom->dom = d;

  return (void *)dom;
}

static void xfair_free_domdata(const struct scheduler *ops, void *d)
{
#ifdef RTS_CONFIG_DEBUG
  struct xfair_dom *dom = d;
#endif

  debug(KERN_INFO "%s: ", __func__);
  debug("dom=%d\n", dom->dom->domain_id);

  xfree(d);
}

static int xfair_dom_init(const struct scheduler *ops, struct domain *d)
{
  struct xfair_dom *dom;

  if (is_idle_domain(d))
    return 0;

  dom = xfair_alloc_domdata(ops, d);
  if (dom == NULL)
    return -ENOMEM;

  d->sched_priv = dom;

  debug(KERN_INFO "%s: ", __func__);
  debug("dom=%d\n", d->domain_id);

  return 0;
}

static void
xfair_dom_destroy(const struct scheduler *ops, struct domain *d)
{
  debug(KERN_INFO "%s: ", __func__);
  debug("dom=%d\n", d->domain_id);

  xfair_free_domdata(ops, XFAIR_DOM(d));
}

static int xfair_pick_cpu(const struct scheduler *ops, struct vcpu *v)
{
    debug(KERN_INFO "%s: ", __func__);
    debug("vcpu=%d, pcpu picked=%d\n", v->vcpu_id, v->processor);

    return v->processor;
}

/*
 * This function is in the critical path. It is designed to be simple and
 * fast for the common case.
 */
static struct task_slice
xfair_schedule(
    const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled)
{
    const int cpu = smp_processor_id();
    struct list_head * const runq = RUNQ(cpu);
    struct xfair_vcpu * const scurr = XFAIR_VCPU(current);
    struct xfair_vcpu *snext;
    struct task_slice ret;
    s_time_t tslice = MILLISECS(XFAIR_DEFAULT_TSLICE_MS);

    /* Add the current VCPU back into the runqueue if it is still runnable */
    if (!__vcpu_on_runq(scurr) && vcpu_runnable(current)
        && !is_idle_vcpu(current))
        __runq_insert(cpu, scurr);

    print_runq(cpu);

    /* Tasklet work (which runs in idle VCPU context) overrides all else. */
    if (tasklet_work_scheduled) {
        debug(KERN_INFO "%s: ", __func__);
        debug("tasklet work scheduled. idling.\n");
        snext = XFAIR_VCPU(idle_vcpu[cpu]);
    } else {
        /* Select next runnable local VCPU (i.e. top of the local runq) */
        if (!list_empty(runq)) {
            snext = __runq_elem(runq->next);
            if (__vcpu_on_runq(snext))
                __runq_remove(snext);
        } else {
            snext = XFAIR_VCPU(idle_vcpu[cpu]);
        }
    }

    print_runq(cpu);

    /* Initialize, check and return task to run next */
    ret.task = snext->vcpu;
    ret.time = (is_idle_vcpu(snext->vcpu) ? -1 : tslice);
    ret.migrated = 0;

    if (snext->vcpu != current) {
        debug(KERN_INFO "%s: ", __func__);
        if (!is_idle_vcpu(snext->vcpu))
            debug("CPU %d picked (dom.vcpu)=%d.%d\n", cpu,
                  snext->domain->dom->domain_id, snext->vcpu->vcpu_id);
        else
            debug("CPU %d picked (dom.vcpu)=idle.%d\n", cpu,
                  snext->vcpu->vcpu_id);
    }

    return ret;
}

static int
xfair_init(struct scheduler *ops)
{
  struct xfair_private *priv;

  priv = xzalloc(struct xfair_private);
  if (priv == NULL)
    return -ENOMEM;

  ops->sched_data = priv;
  spin_lock_init(&priv->lock);
    debugtrace_toggle();

  return 0;
}

static void
xfair_deinit(const struct scheduler *ops)
{
  struct xfair_private *priv;

  priv = XFAIR_PRIV(ops);
  if (priv)
    xfree(priv);
}

static struct xfair_private _xfair_priv;

const struct scheduler sched_xfair_def = {
    .name           = "XFair Table Driven Scheduler",
    .opt_name       = "xfair",
    .sched_id       = XEN_SCHEDULER_XFAIR,
    .sched_data     = &_xfair_priv,

    .init_domain    = xfair_dom_init,
    .destroy_domain = xfair_dom_destroy,

    .insert_vcpu    = xfair_vcpu_insert,
    .remove_vcpu    = xfair_vcpu_remove,

    .sleep          = xfair_sleep,
    .wake           = xfair_wake,
    .yield          = xfair_yield,

    .pick_cpu       = xfair_pick_cpu,
    .do_schedule    = xfair_schedule,
    .init           = xfair_init,
    .deinit         = xfair_deinit,

    .alloc_vdata    = xfair_alloc_vdata,
    .free_vdata     = xfair_free_vdata,
    .alloc_pdata    = xfair_alloc_pdata,
    .free_pdata     = xfair_free_pdata,
    .alloc_domdata  = xfair_alloc_domdata,
    .free_domdata   = xfair_free_domdata,
};

---------- SOURCE CODE ENDS ----------
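
In case it matters, the scheduler is wired into the hypervisor in the usual way, roughly as follows (the ID value below is just an example from my tree, chosen not to clash with the existing XEN_SCHEDULER_* values):

/* xen/include/public/domctl.h */
#define XEN_SCHEDULER_XFAIR 9

/* xen/common/schedule.c */
static const struct scheduler *schedulers[] = {
    ...
    &sched_xfair_def,
};

Xen is then booted with sched=xfair on the hypervisor command line.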
