[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 04 of 10 v2] xen: allow for explicitly specifying node-affinity



Make it possible for the upper layers to pass the node-affinity of a domain
to the hypervisor, instead of always having it computed automatically.

Note that this also required generalizing the Flask hooks for setting
and getting the affinity, so that they now deal with both vcpu and
node affinity.

Signed-off-by: Dario Faggioli <dario.faggioli@xxxxxxxxxx>
---
Changes from v1:
 * added the missing dummy hook for nodeaffinity;
 * let the permission renaming affect flask policies too.

diff --git a/tools/flask/policy/policy/flask/access_vectors b/tools/flask/policy/policy/flask/access_vectors
--- a/tools/flask/policy/policy/flask/access_vectors
+++ b/tools/flask/policy/policy/flask/access_vectors
@@ -47,8 +47,8 @@ class domain
     transition
     max_vcpus
     destroy
-    setvcpuaffinity
-       getvcpuaffinity
+    setaffinity
+       getaffinity
        scheduler
        getdomaininfo
        getvcpuinfo
diff --git a/tools/flask/policy/policy/mls b/tools/flask/policy/policy/mls
--- a/tools/flask/policy/policy/mls
+++ b/tools/flask/policy/policy/mls
@@ -70,11 +70,11 @@ mlsconstrain domain transition
        (( h1 dom h2 ) and (( l1 eq l2 ) or (t1 == mls_priv)));
 
 # all the domain "read" ops
-mlsconstrain domain { getvcpuaffinity getdomaininfo getvcpuinfo getvcpucontext getaddrsize getextvcpucontext }
+mlsconstrain domain { getaffinity getdomaininfo getvcpuinfo getvcpucontext getaddrsize getextvcpucontext }
        ((l1 dom l2) or (t1 == mls_priv));
 
 # all the domain "write" ops
-mlsconstrain domain { setvcpucontext pause unpause resume create max_vcpus destroy setvcpuaffinity scheduler setdomainmaxmem setdomainhandle setdebugging hypercall settime set_target shutdown setaddrsize trigger setextvcpucontext }
+mlsconstrain domain { setvcpucontext pause unpause resume create max_vcpus destroy setaffinity scheduler setdomainmaxmem setdomainhandle setdebugging hypercall settime set_target shutdown setaddrsize trigger setextvcpucontext }
        ((l1 eq l2) or (t1 == mls_priv));
 
 # This is incomplete - similar constraints must be written for all classes
diff --git a/tools/flask/policy/policy/modules/xen/xen.if b/tools/flask/policy/policy/modules/xen/xen.if
--- a/tools/flask/policy/policy/modules/xen/xen.if
+++ b/tools/flask/policy/policy/modules/xen/xen.if
@@ -55,9 +55,9 @@ define(`create_domain_build_label', `
 # manage_domain(priv, target)
 #   Allow managing a running domain
 define(`manage_domain', `
-       allow $1 $2:domain { getdomaininfo getvcpuinfo getvcpuaffinity
+       allow $1 $2:domain { getdomaininfo getvcpuinfo getaffinity
                        getaddrsize pause unpause trigger shutdown destroy
-                       setvcpuaffinity setdomainmaxmem };
+                       setaffinity setdomainmaxmem };
 ')
 
 # migrate_domain_out(priv, target)
diff --git a/xen/common/domain.c b/xen/common/domain.c
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -222,6 +222,7 @@ struct domain *domain_create(
 
     spin_lock_init(&d->node_affinity_lock);
     d->node_affinity = NODE_MASK_ALL;
+    d->auto_node_affinity = 1;
 
     spin_lock_init(&d->shutdown_lock);
     d->shutdown_code = -1;
@@ -362,11 +363,26 @@ void domain_update_node_affinity(struct 
         cpumask_or(cpumask, cpumask, online_affinity);
     }
 
-    for_each_online_node ( node )
-        if ( cpumask_intersects(&node_to_cpumask(node), cpumask) )
-            node_set(node, nodemask);
+    if ( d->auto_node_affinity )
+    {
+        /* Node-affinity is automatically computed from all vcpu-affinities */
+        for_each_online_node ( node )
+            if ( cpumask_intersects(&node_to_cpumask(node), cpumask) )
+                node_set(node, nodemask);
 
-    d->node_affinity = nodemask;
+        d->node_affinity = nodemask;
+    }
+    else
+    {
+        /* Node-affinity is provided by someone else, just filter out nodes
+         * whose cpus are all either offline or not in any vcpu's affinity. */
+        for_each_node_mask ( node, d->node_affinity )
+            if ( !cpumask_intersects(&node_to_cpumask(node), cpumask) )
+                node_clear(node, d->node_affinity);
+    }
+
+    sched_set_node_affinity(d, &d->node_affinity);
+
     spin_unlock(&d->node_affinity_lock);
 
     free_cpumask_var(online_affinity);
@@ -374,6 +390,36 @@ void domain_update_node_affinity(struct 
 }
 
 
+int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity)
+{
+    /* Being affine with no nodes is just wrong */
+    if ( nodes_empty(*affinity) )
+        return -EINVAL;
+
+    spin_lock(&d->node_affinity_lock);
+
+    /*
+     * Being/becoming explicitly affine to all nodes is not particularly
+     * useful. Let's take it as the `reset node affinity` command.
+     */
+    if ( nodes_full(*affinity) )
+    {
+        d->auto_node_affinity = 1;
+        goto out;
+    }
+
+    d->auto_node_affinity = 0;
+    d->node_affinity = *affinity;
+
+out:
+    spin_unlock(&d->node_affinity_lock);
+
+    domain_update_node_affinity(d);
+
+    return 0;
+}
+
+
 struct domain *get_domain_by_id(domid_t dom)
 {
     struct domain *d;
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -609,6 +609,40 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
     }
     break;
 
+    case XEN_DOMCTL_setnodeaffinity:
+    case XEN_DOMCTL_getnodeaffinity:
+    {
+        domid_t dom = op->domain;
+        struct domain *d = rcu_lock_domain_by_id(dom);
+
+        ret = -ESRCH;
+        if ( d == NULL )
+            break;
+
+        ret = xsm_nodeaffinity(op->cmd, d);
+        if ( ret )
+            goto nodeaffinity_out;
+
+        if ( op->cmd == XEN_DOMCTL_setnodeaffinity )
+        {
+            nodemask_t new_affinity;
+
+            ret = xenctl_bitmap_to_nodemask(&new_affinity,
+                                            &op->u.nodeaffinity.nodemap);
+            if ( !ret )
+                ret = domain_set_node_affinity(d, &new_affinity);
+        }
+        else
+        {
+            ret = nodemask_to_xenctl_bitmap(&op->u.nodeaffinity.nodemap,
+                                            &d->node_affinity);
+        }
+
+    nodeaffinity_out:
+        rcu_unlock_domain(d);
+    }
+    break;
+
     case XEN_DOMCTL_setvcpuaffinity:
     case XEN_DOMCTL_getvcpuaffinity:
     {
diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -217,6 +217,14 @@ static void cpuset_print(char *set, int 
     *set++ = '\0';
 }
 
+static void nodeset_print(char *set, int size, const nodemask_t *mask)
+{
+    *set++ = '[';
+    set += nodelist_scnprintf(set, size-2, mask);
+    *set++ = ']';
+    *set++ = '\0';
+}
+
 static void periodic_timer_print(char *str, int size, uint64_t period)
 {
     if ( period == 0 )
@@ -272,6 +280,9 @@ static void dump_domains(unsigned char k
 
         dump_pageframe_info(d);
                
+        nodeset_print(tmpstr, sizeof(tmpstr), &d->node_affinity);
+        printk("NODE affinity for domain %d: %s\n", d->domain_id, tmpstr);
+
         printk("VCPU information and callbacks for domain %u:\n",
                d->domain_id);
         for_each_vcpu ( d, v )
diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -269,6 +269,33 @@ static inline void
     list_del_init(&svc->runq_elem);
 }
 
+/*
+ * Translates node-affinity mask into a cpumask, so that we can use it during
+ * actual scheduling. That of course will contain all the cpus from all the
+ * set nodes in the original node-affinity mask.
+ *
+ * Note that any serialization needed to access mask safely is entirely the
+ * responsibility of the caller of this function/hook.
+ */
+static void csched_set_node_affinity(
+    const struct scheduler *ops,
+    struct domain *d,
+    nodemask_t *mask)
+{
+    struct csched_dom *sdom;
+    int node;
+
+    /* Skip idle domain since it doesn't even have a node_affinity_cpumask */
+    if ( unlikely(is_idle_domain(d)) )
+        return;
+
+    sdom = CSCHED_DOM(d);
+    cpumask_clear(sdom->node_affinity_cpumask);
+    for_each_node_mask( node, *mask )
+        cpumask_or(sdom->node_affinity_cpumask, sdom->node_affinity_cpumask,
+                   &node_to_cpumask(node));
+}
+
 #define for_each_csched_balance_step(__step) \
     for ( (__step) = CSCHED_BALANCE_LAST; (__step) >= 0; (__step)-- )
 
@@ -296,7 +323,8 @@ csched_balance_cpumask(const struct vcpu
 
         cpumask_and(mask, sdom->node_affinity_cpumask, vc->cpu_affinity);
 
-        if ( cpumask_full(sdom->node_affinity_cpumask) )
+        if ( cpumask_full(sdom->node_affinity_cpumask) ||
+             d->auto_node_affinity == 1 )
             return -1;
     }
     else /* step == CSCHED_BALANCE_CPU_AFFINITY */
@@ -1896,6 +1924,8 @@ const struct scheduler sched_credit_def 
     .adjust         = csched_dom_cntl,
     .adjust_global  = csched_sys_cntl,
 
+    .set_node_affinity  = csched_set_node_affinity,
+
     .pick_cpu       = csched_cpu_pick,
     .do_schedule    = csched_schedule,
 
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -590,6 +590,11 @@ int cpu_disable_scheduler(unsigned int c
     return ret;
 }
 
+void sched_set_node_affinity(struct domain *d, nodemask_t *mask)
+{
+    SCHED_OP(DOM2OP(d), set_node_affinity, d, mask);
+}
+
 int vcpu_set_affinity(struct vcpu *v, const cpumask_t *affinity)
 {
     cpumask_t online_affinity;
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -279,6 +279,16 @@ typedef struct xen_domctl_getvcpuinfo xe
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvcpuinfo_t);
 
 
+/* Get/set the NUMA node(s) with which the guest has affinity. */
+/* XEN_DOMCTL_setnodeaffinity */
+/* XEN_DOMCTL_getnodeaffinity */
+struct xen_domctl_nodeaffinity {
+    struct xenctl_bitmap nodemap;/* IN */
+};
+typedef struct xen_domctl_nodeaffinity xen_domctl_nodeaffinity_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_nodeaffinity_t);
+
+
 /* Get/set which physical cpus a vcpu can execute on. */
 /* XEN_DOMCTL_setvcpuaffinity */
 /* XEN_DOMCTL_getvcpuaffinity */
@@ -907,6 +917,8 @@ struct xen_domctl {
 #define XEN_DOMCTL_audit_p2m                     65
 #define XEN_DOMCTL_set_virq_handler              66
 #define XEN_DOMCTL_set_broken_page_p2m           67
+#define XEN_DOMCTL_setnodeaffinity               68
+#define XEN_DOMCTL_getnodeaffinity               69
 #define XEN_DOMCTL_gdbsx_guestmemio            1000
 #define XEN_DOMCTL_gdbsx_pausevcpu             1001
 #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
@@ -920,6 +932,7 @@ struct xen_domctl {
         struct xen_domctl_getpageframeinfo  getpageframeinfo;
         struct xen_domctl_getpageframeinfo2 getpageframeinfo2;
         struct xen_domctl_getpageframeinfo3 getpageframeinfo3;
+        struct xen_domctl_nodeaffinity      nodeaffinity;
         struct xen_domctl_vcpuaffinity      vcpuaffinity;
         struct xen_domctl_shadow_op         shadow_op;
         struct xen_domctl_max_mem           max_mem;
diff --git a/xen/include/xen/nodemask.h b/xen/include/xen/nodemask.h
--- a/xen/include/xen/nodemask.h
+++ b/xen/include/xen/nodemask.h
@@ -8,8 +8,9 @@
  * See detailed comments in the file linux/bitmap.h describing the
  * data type on which these nodemasks are based.
  *
- * For details of nodemask_scnprintf() and nodemask_parse(),
- * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c.
+ * For details of nodemask_scnprintf(), nodelist_scnprintf() and
+ * nodemask_parse(), see bitmap_scnprintf() and bitmap_parse()
+ * in lib/bitmap.c.
  *
  * The available nodemask operations are:
  *
@@ -50,6 +51,7 @@
  * unsigned long *nodes_addr(mask)     Array of unsigned long's in mask
  *
  * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing
+ * int nodelist_scnprintf(buf, len, mask) Format nodemask as a list for printing
  * int nodemask_parse(ubuf, ulen, mask)        Parse ascii string as nodemask
  *
  * for_each_node_mask(node, mask)      for-loop node over mask
@@ -292,6 +294,14 @@ static inline int __cycle_node(int n, co
 
 #define nodes_addr(src) ((src).bits)
 
+#define nodelist_scnprintf(buf, len, src) \
+                       __nodelist_scnprintf((buf), (len), (src), MAX_NUMNODES)
+static inline int __nodelist_scnprintf(char *buf, int len,
+                                       const nodemask_t *srcp, int nbits)
+{
+       return bitmap_scnlistprintf(buf, len, srcp->bits, nbits);
+}
+
 #if 0
 #define nodemask_scnprintf(buf, len, src) \
                        __nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES)
diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h
--- a/xen/include/xen/sched-if.h
+++ b/xen/include/xen/sched-if.h
@@ -184,6 +184,8 @@ struct scheduler {
                                     struct xen_domctl_scheduler_op *);
     int          (*adjust_global)  (const struct scheduler *,
                                     struct xen_sysctl_scheduler_op *);
+    void         (*set_node_affinity) (const struct scheduler *,
+                                       struct domain *, nodemask_t *);
     void         (*dump_settings)  (const struct scheduler *);
     void         (*dump_cpu_state) (const struct scheduler *, int);
 
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -359,8 +359,12 @@ struct domain
     /* Various mem_events */
     struct mem_event_per_domain *mem_event;
 
-    /* Currently computed from union of all vcpu cpu-affinity masks. */
+    /*
+     * Can be specified by the user. If that is not the case, it is
+     * computed from the union of all the vcpu cpu-affinity masks.
+     */
     nodemask_t node_affinity;
+    int auto_node_affinity;
     unsigned int last_alloc_node;
     spinlock_t node_affinity_lock;
 };
@@ -429,6 +433,7 @@ static inline void get_knownalive_domain
     ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED));
 }
 
+int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity);
 void domain_update_node_affinity(struct domain *d);
 
 struct domain *domain_create(
@@ -543,6 +548,7 @@ void sched_destroy_domain(struct domain 
 int sched_move_domain(struct domain *d, struct cpupool *c);
 long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *);
 long sched_adjust_global(struct xen_sysctl_scheduler_op *);
+void sched_set_node_affinity(struct domain *, nodemask_t *);
 int  sched_id(void);
 void sched_tick_suspend(void);
 void sched_tick_resume(void);
diff --git a/xen/include/xsm/xsm.h b/xen/include/xsm/xsm.h
--- a/xen/include/xsm/xsm.h
+++ b/xen/include/xsm/xsm.h
@@ -56,6 +56,7 @@ struct xsm_operations {
     int (*domain_create) (struct domain *d, u32 ssidref);
     int (*max_vcpus) (struct domain *d);
     int (*destroydomain) (struct domain *d);
+    int (*nodeaffinity) (int cmd, struct domain *d);
     int (*vcpuaffinity) (int cmd, struct domain *d);
     int (*scheduler) (struct domain *d);
     int (*getdomaininfo) (struct domain *d);
@@ -229,6 +230,11 @@ static inline int xsm_destroydomain (str
     return xsm_call(destroydomain(d));
 }
 
+static inline int xsm_nodeaffinity (int cmd, struct domain *d)
+{
+    return xsm_call(nodeaffinity(cmd, d));
+}
+
 static inline int xsm_vcpuaffinity (int cmd, struct domain *d)
 {
     return xsm_call(vcpuaffinity(cmd, d));
diff --git a/xen/xsm/dummy.c b/xen/xsm/dummy.c
--- a/xen/xsm/dummy.c
+++ b/xen/xsm/dummy.c
@@ -54,6 +54,11 @@ static int dummy_destroydomain (struct d
     return 0;
 }
 
+static int dummy_nodeaffinity (int cmd, struct domain *d)
+{
+    return 0;
+}
+
 static int dummy_vcpuaffinity (int cmd, struct domain *d)
 {
     return 0;
@@ -634,6 +639,7 @@ void xsm_fixup_ops (struct xsm_operation
     set_to_dummy_if_null(ops, domain_create);
     set_to_dummy_if_null(ops, max_vcpus);
     set_to_dummy_if_null(ops, destroydomain);
+    set_to_dummy_if_null(ops, nodeaffinity);
     set_to_dummy_if_null(ops, vcpuaffinity);
     set_to_dummy_if_null(ops, scheduler);
     set_to_dummy_if_null(ops, getdomaininfo);
diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c
--- a/xen/xsm/flask/hooks.c
+++ b/xen/xsm/flask/hooks.c
@@ -521,17 +521,19 @@ static int flask_destroydomain(struct do
                            DOMAIN__DESTROY);
 }
 
-static int flask_vcpuaffinity(int cmd, struct domain *d)
+static int flask_affinity(int cmd, struct domain *d)
 {
     u32 perm;
 
     switch ( cmd )
     {
     case XEN_DOMCTL_setvcpuaffinity:
-        perm = DOMAIN__SETVCPUAFFINITY;
+    case XEN_DOMCTL_setnodeaffinity:
+        perm = DOMAIN__SETAFFINITY;
         break;
     case XEN_DOMCTL_getvcpuaffinity:
-        perm = DOMAIN__GETVCPUAFFINITY;
+    case XEN_DOMCTL_getnodeaffinity:
+        perm = DOMAIN__GETAFFINITY;
         break;
     default:
         return -EPERM;
@@ -1473,7 +1475,8 @@ static struct xsm_operations flask_ops =
     .domain_create = flask_domain_create,
     .max_vcpus = flask_max_vcpus,
     .destroydomain = flask_destroydomain,
-    .vcpuaffinity = flask_vcpuaffinity,
+    .nodeaffinity = flask_affinity,
+    .vcpuaffinity = flask_affinity,
     .scheduler = flask_scheduler,
     .getdomaininfo = flask_getdomaininfo,
     .getvcpucontext = flask_getvcpucontext,
diff --git a/xen/xsm/flask/include/av_perm_to_string.h b/xen/xsm/flask/include/av_perm_to_string.h
--- a/xen/xsm/flask/include/av_perm_to_string.h
+++ b/xen/xsm/flask/include/av_perm_to_string.h
@@ -37,8 +37,8 @@
    S_(SECCLASS_DOMAIN, DOMAIN__TRANSITION, "transition")
    S_(SECCLASS_DOMAIN, DOMAIN__MAX_VCPUS, "max_vcpus")
    S_(SECCLASS_DOMAIN, DOMAIN__DESTROY, "destroy")
-   S_(SECCLASS_DOMAIN, DOMAIN__SETVCPUAFFINITY, "setvcpuaffinity")
-   S_(SECCLASS_DOMAIN, DOMAIN__GETVCPUAFFINITY, "getvcpuaffinity")
+   S_(SECCLASS_DOMAIN, DOMAIN__SETAFFINITY, "setaffinity")
+   S_(SECCLASS_DOMAIN, DOMAIN__GETAFFINITY, "getaffinity")
    S_(SECCLASS_DOMAIN, DOMAIN__SCHEDULER, "scheduler")
    S_(SECCLASS_DOMAIN, DOMAIN__GETDOMAININFO, "getdomaininfo")
    S_(SECCLASS_DOMAIN, DOMAIN__GETVCPUINFO, "getvcpuinfo")
diff --git a/xen/xsm/flask/include/av_permissions.h b/xen/xsm/flask/include/av_permissions.h
--- a/xen/xsm/flask/include/av_permissions.h
+++ b/xen/xsm/flask/include/av_permissions.h
@@ -38,8 +38,8 @@
 #define DOMAIN__TRANSITION                        0x00000020UL
 #define DOMAIN__MAX_VCPUS                         0x00000040UL
 #define DOMAIN__DESTROY                           0x00000080UL
-#define DOMAIN__SETVCPUAFFINITY                   0x00000100UL
-#define DOMAIN__GETVCPUAFFINITY                   0x00000200UL
+#define DOMAIN__SETAFFINITY                       0x00000100UL
+#define DOMAIN__GETAFFINITY                       0x00000200UL
 #define DOMAIN__SCHEDULER                         0x00000400UL
 #define DOMAIN__GETDOMAININFO                     0x00000800UL
 #define DOMAIN__GETVCPUINFO                       0x00001000UL

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.