x86/x2apic: properly implement cluster mode

So far, cluster mode was just an alternative implementation of physical
mode: allowing only single CPU interrupt targets, and sending IPIs to
each target CPU separately. Take advantage of what cluster mode really
can do in that regard.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/genapic/x2apic.c
+++ b/xen/arch/x86/genapic/x2apic.c
@@ -19,6 +19,7 @@
 
 #include <xen/config.h>
 #include <xen/init.h>
+#include <xen/cpu.h>
 #include <xen/cpumask.h>
 #include <asm/apicdef.h>
 #include <asm/genapic.h>
@@ -33,6 +34,14 @@ static bool_t __initdata x2apic_phys; /*
 boolean_param("x2apic_phys", x2apic_phys);
 
 static DEFINE_PER_CPU_READ_MOSTLY(u32, cpu_2_logical_apicid);
+static DEFINE_PER_CPU_READ_MOSTLY(cpumask_t *, cluster_cpus);
+static cpumask_t *cluster_cpus_spare;
+static DEFINE_PER_CPU(cpumask_var_t, scratch_mask);
+
+static inline u32 x2apic_cluster(unsigned int cpu)
+{
+    return per_cpu(cpu_2_logical_apicid, cpu) >> 16;
+}
 
 static void init_apic_ldr_x2apic_phys(void)
 {
@@ -40,20 +49,53 @@ static void init_apic_ldr_x2apic_phys(vo
 
 static void init_apic_ldr_x2apic_cluster(void)
 {
-    this_cpu(cpu_2_logical_apicid) = apic_read(APIC_LDR);
+    unsigned int cpu, this_cpu = smp_processor_id();
+
+    per_cpu(cpu_2_logical_apicid, this_cpu) = apic_read(APIC_LDR);
+
+    if ( per_cpu(cluster_cpus, this_cpu) )
+    {
+        ASSERT(cpumask_test_cpu(this_cpu, per_cpu(cluster_cpus, this_cpu)));
+        return;
+    }
+
+    per_cpu(cluster_cpus, this_cpu) = cluster_cpus_spare;
+    for_each_online_cpu ( cpu )
+    {
+        if ( this_cpu == cpu || x2apic_cluster(this_cpu) != x2apic_cluster(cpu) )
+            continue;
+        per_cpu(cluster_cpus, this_cpu) = per_cpu(cluster_cpus, cpu);
+        break;
+    }
+    if ( per_cpu(cluster_cpus, this_cpu) == cluster_cpus_spare )
+        cluster_cpus_spare = NULL;
+
+    cpumask_set_cpu(this_cpu, per_cpu(cluster_cpus, this_cpu));
 }
 
 static void __init clustered_apic_check_x2apic(void)
 {
 }
 
+static const cpumask_t *vector_allocation_cpumask_x2apic_cluster(int cpu)
+{
+    return per_cpu(cluster_cpus, cpu);
+}
+
 static unsigned int cpu_mask_to_apicid_x2apic_cluster(const cpumask_t *cpumask)
 {
-    return per_cpu(cpu_2_logical_apicid, cpumask_first(cpumask));
+    unsigned int cpu = cpumask_first(cpumask);
+    unsigned int dest = per_cpu(cpu_2_logical_apicid, cpu);
+    const cpumask_t *cluster_cpus = per_cpu(cluster_cpus, cpu);
+
+    for_each_cpu ( cpu, cluster_cpus )
+        if ( cpumask_test_cpu(cpu, cpumask) )
+            dest |= per_cpu(cpu_2_logical_apicid, cpu);
+
+    return dest;
 }
 
-static void __send_IPI_mask_x2apic(
-    const cpumask_t *cpumask, int vector, unsigned int dest_mode)
+static void send_IPI_mask_x2apic_phys(const cpumask_t *cpumask, int vector)
 {
     unsigned int cpu;
     unsigned long flags;
@@ -77,23 +119,48 @@ static void __send_IPI_mask_x2apic(
     {
         if ( !cpu_online(cpu) || (cpu == smp_processor_id()) )
            continue;
-        msr_content = (dest_mode == APIC_DEST_PHYSICAL)
-            ? cpu_physical_id(cpu) : per_cpu(cpu_2_logical_apicid, cpu);
-        msr_content = (msr_content << 32) | APIC_DM_FIXED | dest_mode | vector;
+        msr_content = cpu_physical_id(cpu);
+        msr_content = (msr_content << 32) | APIC_DM_FIXED |
+                      APIC_DEST_PHYSICAL | vector;
         apic_wrmsr(APIC_ICR, msr_content);
     }
 
     local_irq_restore(flags);
 }
 
-static void send_IPI_mask_x2apic_phys(const cpumask_t *cpumask, int vector)
-{
-    __send_IPI_mask_x2apic(cpumask, vector, APIC_DEST_PHYSICAL);
-}
-
 static void send_IPI_mask_x2apic_cluster(const cpumask_t *cpumask, int vector)
 {
-    __send_IPI_mask_x2apic(cpumask, vector, APIC_DEST_LOGICAL);
+    unsigned int cpu = smp_processor_id();
+    cpumask_t *ipimask = per_cpu(scratch_mask, cpu);
+    const cpumask_t *cluster_cpus;
+    unsigned long flags;
+
+    mb(); /* See above for an explanation. */
+
+    local_irq_save(flags);
+
+    cpumask_andnot(ipimask, &cpu_online_map, cpumask_of(cpu));
+
+    for ( cpumask_and(ipimask, cpumask, ipimask); !cpumask_empty(ipimask);
+          cpumask_andnot(ipimask, ipimask, cluster_cpus) )
+    {
+        uint64_t msr_content = 0;
+
+        cluster_cpus = per_cpu(cluster_cpus, cpumask_first(ipimask));
+        for_each_cpu ( cpu, cluster_cpus )
+        {
+            if ( !cpumask_test_cpu(cpu, ipimask) )
+                continue;
+            msr_content |= per_cpu(cpu_2_logical_apicid, cpu);
+        }
+
+        BUG_ON(!msr_content);
+        msr_content = (msr_content << 32) | APIC_DM_FIXED |
+                      APIC_DEST_LOGICAL | vector;
+        apic_wrmsr(APIC_ICR, msr_content);
+    }
+
+    local_irq_restore(flags);
 }
 
 static const struct genapic apic_x2apic_phys = {
@@ -116,15 +183,61 @@ static const struct genapic apic_x2apic_
     .init_apic_ldr = init_apic_ldr_x2apic_cluster,
     .clustered_apic_check = clustered_apic_check_x2apic,
     .target_cpus = target_cpus_all,
-    .vector_allocation_cpumask = vector_allocation_cpumask_phys,
+    .vector_allocation_cpumask = vector_allocation_cpumask_x2apic_cluster,
     .cpu_mask_to_apicid = cpu_mask_to_apicid_x2apic_cluster,
     .send_IPI_mask = send_IPI_mask_x2apic_cluster,
     .send_IPI_self = send_IPI_self_x2apic
 };
 
+static int update_clusterinfo(
+    struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+    unsigned int cpu = (unsigned long)hcpu;
+    int err = 0;
+
+    switch ( action ) {
+    case CPU_UP_PREPARE:
+        per_cpu(cpu_2_logical_apicid, cpu) = BAD_APICID;
+        if ( !cluster_cpus_spare )
+            cluster_cpus_spare = xzalloc(cpumask_t);
+        if ( !cluster_cpus_spare ||
+             !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
+            err = -ENOMEM;
+        break;
+    case CPU_UP_CANCELED:
+    case CPU_DEAD:
+        if ( per_cpu(cluster_cpus, cpu) )
+        {
+            cpumask_clear_cpu(cpu, per_cpu(cluster_cpus, cpu));
+            if ( cpumask_empty(per_cpu(cluster_cpus, cpu)) )
+                xfree(per_cpu(cluster_cpus, cpu));
+            per_cpu(cluster_cpus, cpu) = NULL;
+        }
+        free_cpumask_var(per_cpu(scratch_mask, cpu));
+        break;
+    }
+
+    return !err ? NOTIFY_DONE : notifier_from_errno(err);
+}
+
+static struct notifier_block x2apic_cpu_nfb = {
+    .notifier_call = update_clusterinfo
+};
+
 const struct genapic *__init apic_x2apic_probe(void)
 {
-    return x2apic_phys ? &apic_x2apic_phys : &apic_x2apic_cluster;
+    if ( x2apic_phys )
+        return &apic_x2apic_phys;
+
+    if ( !this_cpu(cluster_cpus) )
+    {
+        update_clusterinfo(NULL, CPU_UP_PREPARE,
+                           (void *)(long)smp_processor_id());
+        init_apic_ldr_x2apic_cluster();
+        register_cpu_notifier(&x2apic_cpu_nfb);
+    }
+
+    return &apic_x2apic_cluster;
 }
 
 void __init check_x2apic_preenabled(void)